blob: af05f4c569c61f5a396b1bd4e96b639e8c4525fe [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Maximum number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list "keep-alive" optimization.

   Objects parked on the free list keep their allocated Unicode memory
   as long as their size is below this limit, which saves malloc()
   overhead for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is the default */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Generic helper macro that copies a run of characters while converting
   each one from one character type to another.

   from_type and to_type have to be valid type names.  begin and end
   delimit the source characters and should be of type "from_type *".
   to points at the buffer receiving the result; it is accessed as
   "to_type *".  Every character in [begin, end) is cast to to_type and
   stored, so the buffer needs room for (end - begin) characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        const from_type *src_ = (begin);                             \
        to_type *dst_ = (to_type *)(to);                             \
        while (src_ < (end)) {                                       \
            *dst_ = (to_type)*src_;                                  \
            ++src_;                                                  \
            ++dst_;                                                  \
        }                                                            \
    } while (0)
107
/* Field accessors for unicode objects.  They cast op to the struct that
   declares the field and expand to an lvalue, so they can be assigned to.
   No type checking is done on op. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked variants: these assert that op is a unicode object before
   reading its kind / length. */
#define _PyUnicode_KIND(op)                     \
    (assert(PyUnicode_Check(op)),               \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)               \
    (assert(PyUnicode_Check(op)),               \
     ((PyASCIIObject *)(op))->length)
119
120
Walter Dörwald16807132007-05-25 13:52:07 +0000121/* This dictionary holds all interned unicode strings. Note that references
122 to strings in this dictionary are *not* counted in the string's ob_refcnt.
123 When the interned string reaches a refcnt of 0 the string deallocation
124 function will delete the reference from this dictionary.
125
126 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000127 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000128*/
129static PyObject *interned;
130
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000131/* The empty Unicode object is shared to improve performance. */
132static PyUnicodeObject *unicode_empty;
133
134/* Single character Unicode strings in the Latin-1 range are being
135 shared as well. */
136static PyUnicodeObject *unicode_latin1[256];
137
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when the ASCII code c is one of the whitespace characters listed below,
   and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009: CHARACTER TABULATION
     *   0x000A: LINE FEED
     *   0x000B: LINE TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     *   0x001F: UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020: SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
168
Alexander Belopolsky40018472011-02-26 01:02:56 +0000169static PyObject *
170unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000171 PyObject **errorHandler,const char *encoding, const char *reason,
172 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
173 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
174
Alexander Belopolsky40018472011-02-26 01:02:56 +0000175static void
176raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300177 const char *encoding,
178 const Py_UNICODE *unicode, Py_ssize_t size,
179 Py_ssize_t startpos, Py_ssize_t endpos,
180 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000181
/* Same for linebreaks: entry c is 1 when the ASCII code c is one of the
   line break characters listed below, and 0 otherwise.  The table is
   read-only lookup data, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A: LINE FEED
     *   0x000B: LINE TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN
     */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
209
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300210/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
211 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000212Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000213PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000214{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000215#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000216 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000217#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000218 /* This is actually an illegal character, so it should
219 not be passed to unichr. */
220 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000221#endif
222}
223
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224/* --- Bloom Filters ----------------------------------------------------- */
225
226/* stuff to implement simple "bloom filters" for Unicode characters.
227 to keep things simple, we use a single bitmask, using the least 5
228 bits from each unicode characters as the bit index. */
229
230/* the linebreak mask is set up by Unicode_Init below */
231
Antoine Pitrouf068f942010-01-13 14:19:12 +0000232#if LONG_BIT >= 128
233#define BLOOM_WIDTH 128
234#elif LONG_BIT >= 64
235#define BLOOM_WIDTH 64
236#elif LONG_BIT >= 32
237#define BLOOM_WIDTH 32
238#else
239#error "LONG_BIT is smaller than 32"
240#endif
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242#define BLOOM_MASK unsigned long
243
244static BLOOM_MASK bloom_linebreak;
245
Antoine Pitrouf068f942010-01-13 14:19:12 +0000246#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
247#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000248
Benjamin Peterson29060642009-01-31 22:14:21 +0000249#define BLOOM_LINEBREAK(ch) \
250 ((ch) < 128U ? ascii_linebreak[(ch)] : \
251 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000252
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200254make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000255{
256 /* calculate simple bloom-style bitmask for a given unicode string */
257
Antoine Pitrouf068f942010-01-13 14:19:12 +0000258 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000259 Py_ssize_t i;
260
261 mask = 0;
262 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200263 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000264
265 return mask;
266}
267
/* Non-zero when chr occurs in str.  The bloom mask is tested first, so
   PyUnicode_FindChar() is only called when the mask cannot rule chr out.
   Arguments are parenthesized so that arbitrary expressions can be passed;
   note that chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str)                                     \
    (BLOOM((mask), (chr))                                                \
     && (PyUnicode_FindChar((str), (chr), 0,                             \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272/* --- Unicode Object ----------------------------------------------------- */
273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200274static PyObject *
275substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);
276
277static PyObject *
278fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
279
280Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
281 Py_ssize_t size, Py_UCS4 ch,
282 int direction)
283{
284 /* like wcschr, but doesn't stop at NULL characters */
285 Py_ssize_t i;
286 if (direction == 1) {
287 for(i = 0; i < size; i++)
288 if (PyUnicode_READ(kind, s, i) == ch)
289 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
290 }
291 else {
292 for(i = size-1; i >= 0; i--)
293 if (PyUnicode_READ(kind, s, i) == ch)
294 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
295 }
296 return NULL;
297}
298
Alexander Belopolsky40018472011-02-26 01:02:56 +0000299static int
300unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200301 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302{
303 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200305 /* Resizing is only supported for old unicode objects. */
306 assert(!PyUnicode_IS_COMPACT(unicode));
307 assert(_PyUnicode_WSTR(unicode) != NULL);
308
309 /* ... and only if they have not been readied yet, because
310 callees usually rely on the wstr representation when resizing. */
311 assert(unicode->data.any == NULL);
312
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000313 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200314 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000315 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000317 /* Resizing shared object (unicode_empty or single character
318 objects) in-place is not allowed. Use PyUnicode_Resize()
319 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000320
Benjamin Peterson14339b62009-01-31 16:36:08 +0000321 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200322 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
323 _PyUnicode_WSTR(unicode)[0] < 256U &&
324 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000325 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000326 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 return -1;
328 }
329
Thomas Wouters477c8d52006-05-27 19:21:47 +0000330 /* We allocate one more byte to make sure the string is Ux0000 terminated.
331 The overallocation is also used by fastsearch, which assumes that it's
332 safe to look at str[length] (without making any assumptions about what
333 it contains). */
334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200335 oldstr = _PyUnicode_WSTR(unicode);
336 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
337 sizeof(Py_UNICODE) * (length + 1));
338 if (!_PyUnicode_WSTR(unicode)) {
339 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 PyErr_NoMemory();
341 return -1;
342 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200343 _PyUnicode_WSTR(unicode)[length] = 0;
344 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345
Benjamin Peterson29060642009-01-31 22:14:21 +0000346 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200347 if (unicode->data.any != NULL) {
348 PyObject_FREE(unicode->data.any);
349 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
350 PyObject_FREE(unicode->_base.utf8);
351 }
352 unicode->_base.utf8 = NULL;
353 unicode->_base.utf8_length = 0;
354 unicode->data.any = NULL;
355 _PyUnicode_LENGTH(unicode) = 0;
356 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
357 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200359 _PyUnicode_HASH(unicode) = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000360
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 return 0;
362}
363
364/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000365 Ux0000 terminated; some code (e.g. new_identifier)
366 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367
368 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000369 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370
371*/
372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200373#ifdef Py_DEBUG
374int unicode_old_new_calls = 0;
375#endif
376
Alexander Belopolsky40018472011-02-26 01:02:56 +0000377static PyUnicodeObject *
378_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379{
380 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200381 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382
Thomas Wouters477c8d52006-05-27 19:21:47 +0000383 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 if (length == 0 && unicode_empty != NULL) {
385 Py_INCREF(unicode_empty);
386 return unicode_empty;
387 }
388
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000389 /* Ensure we won't overflow the size. */
390 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
391 return (PyUnicodeObject *)PyErr_NoMemory();
392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200393 if (length < 0) {
394 PyErr_SetString(PyExc_SystemError,
395 "Negative size passed to _PyUnicode_New");
396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397 }
398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200399#ifdef Py_DEBUG
400 ++unicode_old_new_calls;
401#endif
402
403 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
404 if (unicode == NULL)
405 return NULL;
406 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
407 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
408 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 PyErr_NoMemory();
410 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200412
Jeremy Hyltond8082792003-09-16 19:41:39 +0000413 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000414 * the caller fails before initializing str -- unicode_resize()
415 * reads str[0], and the Keep-Alive optimization can keep memory
416 * allocated for str alive across a call to unicode_dealloc(unicode).
417 * We don't want unicode_resize to read uninitialized memory in
418 * that case.
419 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200420 _PyUnicode_WSTR(unicode)[0] = 0;
421 _PyUnicode_WSTR(unicode)[length] = 0;
422 _PyUnicode_WSTR_LENGTH(unicode) = length;
423 _PyUnicode_HASH(unicode) = -1;
424 _PyUnicode_STATE(unicode).interned = 0;
425 _PyUnicode_STATE(unicode).kind = 0;
426 _PyUnicode_STATE(unicode).compact = 0;
427 _PyUnicode_STATE(unicode).ready = 0;
428 _PyUnicode_STATE(unicode).ascii = 0;
429 unicode->data.any = NULL;
430 _PyUnicode_LENGTH(unicode) = 0;
431 unicode->_base.utf8 = NULL;
432 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000433 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000434
Benjamin Peterson29060642009-01-31 22:14:21 +0000435 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000436 /* XXX UNREF/NEWREF interface should be more symmetrical */
437 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000438 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000439 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000440 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441}
442
#ifdef Py_DEBUG
/* Debug-build counter; incremented by PyUnicode_New(). */
int unicode_new_new_calls = 0;

/* Real functions wrapping the accessor macros, so that they can be
   called from a debugger. */

/* Wrapper around the _PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return _PyUnicode_UTF8(unicode);
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print layout information about the object to stdout (its address, the
   compact / compact-ascii flags, the addresses directly after the
   PyASCIIObject and PyCompactUnicodeObject headers, and the compact data
   pointer), then return PyUnicode_DATA(unicode). */
void *
_PyUnicode_data(void *unicode)
{
    void *after_ascii_header = (void*)((PyASCIIObject*)(unicode) + 1);
    void *after_compact_header =
        (void*)((PyCompactUnicodeObject*)(unicode) + 1);

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii_header);
    printf("compact op %p\n", after_compact_header);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
464
465PyObject *
466PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
467{
468 PyObject *obj;
469 PyCompactUnicodeObject *unicode;
470 void *data;
471 int kind_state;
472 int is_sharing = 0, is_ascii = 0;
473 Py_ssize_t char_size;
474 Py_ssize_t struct_size;
475
476 /* Optimization for empty strings */
477 if (size == 0 && unicode_empty != NULL) {
478 Py_INCREF(unicode_empty);
479 return (PyObject *)unicode_empty;
480 }
481
482#ifdef Py_DEBUG
483 ++unicode_new_new_calls;
484#endif
485
486 struct_size = sizeof(PyCompactUnicodeObject);
487 if (maxchar < 128) {
488 kind_state = PyUnicode_1BYTE_KIND;
489 char_size = 1;
490 is_ascii = 1;
491 struct_size = sizeof(PyASCIIObject);
492 }
493 else if (maxchar < 256) {
494 kind_state = PyUnicode_1BYTE_KIND;
495 char_size = 1;
496 }
497 else if (maxchar < 65536) {
498 kind_state = PyUnicode_2BYTE_KIND;
499 char_size = 2;
500 if (sizeof(wchar_t) == 2)
501 is_sharing = 1;
502 }
503 else {
504 kind_state = PyUnicode_4BYTE_KIND;
505 char_size = 4;
506 if (sizeof(wchar_t) == 4)
507 is_sharing = 1;
508 }
509
510 /* Ensure we won't overflow the size. */
511 if (size < 0) {
512 PyErr_SetString(PyExc_SystemError,
513 "Negative size passed to PyUnicode_New");
514 return NULL;
515 }
516 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
517 return PyErr_NoMemory();
518
519 /* Duplicated allocation code from _PyObject_New() instead of a call to
520 * PyObject_New() so we are able to allocate space for the object and
521 * it's data buffer.
522 */
523 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
524 if (obj == NULL)
525 return PyErr_NoMemory();
526 obj = PyObject_INIT(obj, &PyUnicode_Type);
527 if (obj == NULL)
528 return NULL;
529
530 unicode = (PyCompactUnicodeObject *)obj;
531 if (is_ascii)
532 data = ((PyASCIIObject*)obj) + 1;
533 else
534 data = unicode + 1;
535 _PyUnicode_LENGTH(unicode) = size;
536 _PyUnicode_HASH(unicode) = -1;
537 _PyUnicode_STATE(unicode).interned = 0;
538 _PyUnicode_STATE(unicode).kind = kind_state;
539 _PyUnicode_STATE(unicode).compact = 1;
540 _PyUnicode_STATE(unicode).ready = 1;
541 _PyUnicode_STATE(unicode).ascii = is_ascii;
542 if (is_ascii) {
543 ((char*)data)[size] = 0;
544 _PyUnicode_WSTR(unicode) = NULL;
545 }
546 else if (kind_state == PyUnicode_1BYTE_KIND) {
547 ((char*)data)[size] = 0;
548 _PyUnicode_WSTR(unicode) = NULL;
549 _PyUnicode_WSTR_LENGTH(unicode) = 0;
550 unicode->utf8_length = 0;
551 unicode->utf8 = NULL;
552 }
553 else {
554 unicode->utf8 = NULL;
555 if (kind_state == PyUnicode_2BYTE_KIND)
556 ((Py_UCS2*)data)[size] = 0;
557 else /* kind_state == PyUnicode_4BYTE_KIND */
558 ((Py_UCS4*)data)[size] = 0;
559 if (is_sharing) {
560 _PyUnicode_WSTR_LENGTH(unicode) = size;
561 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
562 }
563 else {
564 _PyUnicode_WSTR_LENGTH(unicode) = 0;
565 _PyUnicode_WSTR(unicode) = NULL;
566 }
567 }
568 return obj;
569}
570
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  It
   decodes surrogate pairs; the other conversions are implemented as macros
   for efficiency.

   A high surrogate (0xD800-0xDBFF) directly followed by a low surrogate
   (0xDC00-0xDFFF) is combined into a single code point; every other unit,
   including an unpaired surrogate, is copied unchanged.

   This function assumes that unicode can hold one more code point than
   wstr characters for a terminating null character.  Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
609
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200610Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
612 PyObject *from, Py_ssize_t from_start,
613 Py_ssize_t how_many)
614{
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200615 unsigned int from_kind;
616 unsigned int to_kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200617
618 assert(PyUnicode_Check(from));
619 assert(PyUnicode_Check(to));
620
621 if (PyUnicode_READY(from))
622 return -1;
623 if (PyUnicode_READY(to))
624 return -1;
625
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200626 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200627 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
628 PyErr_Format(PyExc_ValueError,
629 "Cannot write %zi characters at %zi "
630 "in a string of %zi characters",
631 how_many, to_start, PyUnicode_GET_LENGTH(to));
632 return -1;
633 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200634 if (how_many == 0)
635 return 0;
636
637 if (Py_REFCNT(to) != 1) {
638 PyErr_SetString(PyExc_ValueError,
639 "Cannot modify a string having more than 1 reference");
640 return -1;
641 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643 from_kind = PyUnicode_KIND(from);
644 to_kind = PyUnicode_KIND(to);
645
646 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200647 /* fast path */
648 Py_MEMCPY((char*)PyUnicode_DATA(to)
649 + PyUnicode_KIND_SIZE(to_kind, to_start),
650 (char*)PyUnicode_DATA(from)
651 + PyUnicode_KIND_SIZE(from_kind, from_start),
652 PyUnicode_KIND_SIZE(to_kind, how_many));
653 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200655
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200656 if (from_kind > to_kind) {
657 /* slow path to check for character overflow */
658 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
659 void *from_data = PyUnicode_DATA(from);
660 void *to_data = PyUnicode_DATA(to);
661 Py_UCS4 ch, maxchar;
662 Py_ssize_t i;
663 int overflow;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200664
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200665 maxchar = 0;
Victor Stinner73f01c62011-09-28 22:28:04 +0200666 overflow = 0;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200667 for (i=0; i < how_many; i++) {
668 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
669 if (ch > maxchar) {
670 maxchar = ch;
671 if (maxchar > to_maxchar) {
672 overflow = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200673 break;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200675 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200676 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
677 }
678 if (!overflow)
679 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200680 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200681 else if (from_kind == PyUnicode_1BYTE_KIND && to_kind == PyUnicode_2BYTE_KIND)
682 {
683 _PyUnicode_CONVERT_BYTES(
684 Py_UCS1, Py_UCS2,
685 PyUnicode_1BYTE_DATA(from) + from_start,
686 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
687 PyUnicode_2BYTE_DATA(to) + to_start
688 );
689 return how_many;
690 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200691 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200692 && to_kind == PyUnicode_4BYTE_KIND)
693 {
694 _PyUnicode_CONVERT_BYTES(
695 Py_UCS1, Py_UCS4,
696 PyUnicode_1BYTE_DATA(from) + from_start,
697 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
698 PyUnicode_4BYTE_DATA(to) + to_start
699 );
700 return how_many;
701 }
702 else if (from_kind == PyUnicode_2BYTE_KIND
703 && to_kind == PyUnicode_4BYTE_KIND)
704 {
705 _PyUnicode_CONVERT_BYTES(
706 Py_UCS2, Py_UCS4,
707 PyUnicode_2BYTE_DATA(from) + from_start,
708 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
709 PyUnicode_4BYTE_DATA(to) + to_start
710 );
711 return how_many;
712 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200713 PyErr_Format(PyExc_ValueError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200714 "Cannot copy UCS%u characters "
715 "into a string of UCS%u characters",
Victor Stinner157f83f2011-09-28 21:41:31 +0200716 1 << (from_kind - 1),
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200717 1 << (to_kind -1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200718 return -1;
719}
720
Victor Stinner17222162011-09-28 22:15:37 +0200721/* Find the maximum code point and count the number of surrogate pairs so a
722 correct string length can be computed before converting a string to UCS4.
723 This function counts single surrogates as a character and not as a pair.
724
725 Return 0 on success, or -1 on error. */
726static int
727find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
728 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729{
730 const wchar_t *iter;
731
732 if (num_surrogates == NULL || maxchar == NULL) {
733 PyErr_SetString(PyExc_SystemError,
734 "unexpected NULL arguments to "
735 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
736 return -1;
737 }
738
739 *num_surrogates = 0;
740 *maxchar = 0;
741
742 for (iter = begin; iter < end; ) {
743 if (*iter > *maxchar)
744 *maxchar = *iter;
745#if SIZEOF_WCHAR_T == 2
746 if (*iter >= 0xD800 && *iter <= 0xDBFF
747 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
748 {
749 Py_UCS4 surrogate_val;
750 surrogate_val = (((iter[0] & 0x3FF)<<10)
751 | (iter[1] & 0x3FF)) + 0x10000;
752 ++(*num_surrogates);
753 if (surrogate_val > *maxchar)
754 *maxchar = surrogate_val;
755 iter += 2;
756 }
757 else
758 iter++;
759#else
760 iter++;
761#endif
762 }
763 return 0;
764}
765
#ifdef Py_DEBUG
/* Debug builds only: counter bumped by _PyUnicode_Ready() every time it
   actually has to convert a string (i.e. the string was not ready yet). */
int unicode_ready_calls = 0;
#endif
769
/* Give a legacy string (one that only has a wchar_t buffer) its canonical
   data buffer.

   If the object already has a data buffer nothing is done.  Otherwise the
   wchar_t buffer is scanned for its largest code point and the narrowest of
   the 1-, 2- and 4-byte kinds able to hold it is selected.  Depending on
   sizeof(wchar_t) the data buffer is either freshly allocated (and the
   wchar_t buffer released) or simply shares the wchar_t buffer.

   Return 0 on success and -1 on failure; MemoryError is set when an
   allocation fails. */
int
_PyUnicode_Ready(PyUnicodeObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    assert(PyUnicode_Check(unicode));

    /* A data buffer is already present: nothing to convert. */
    if (unicode->data.any != NULL) {
        assert(PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
        return 0;
    }

    /* _PyUnicode_Ready() is only intended for old-style API usage where
     * strings were created using _PyObject_New() and where no canonical
     * representation (the str field) has been set yet aka strings
     * which are not yet ready.
     */
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(!PyUnicode_IS_READY(unicode));
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == 0);
    assert(unicode->_base.utf8 == NULL);

#ifdef Py_DEBUG
    ++unicode_ready_calls;
#endif

    /* Determine the widest character and the number of surrogate pairs in
       the wchar_t buffer; this decides the kind chosen below. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar,
                                &num_surrogates) == -1) {
        assert(0 && "PyUnicode_FindMaxCharAndNumSurrogatePairs failed");
        return -1;
    }

    if (maxchar < 256) {
        /* Every character fits in one byte: narrow into a new buffer
           (+1 for the trailing NUL) and drop the wchar_t buffer. */
        unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!unicode->data.any) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer doubles as the UTF-8 form. */
            unicode->_base.utf8 = unicode->data.any;
            unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            unicode->_base.utf8 = NULL;
            unicode->_base.utf8_length = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        unicode->data.any = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        unicode->_base.utf8 = NULL;
        unicode->_base.utf8_length = 0;
#else
        /* sizeof(wchar_t) == 4 */
        unicode->data.any = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!unicode->data.any) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        unicode->_base.utf8 = NULL;
        unicode->_base.utf8_length = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into a single code point. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!unicode->data.any) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        unicode->_base.utf8 = NULL;
        unicode->_base.utf8_length = 0;
        /* NOTE(review): on failure the freshly allocated data buffer and the
           updated kind/length are left in place -- confirm that
           unicode_convert_wchar_to_ucs4() cannot actually fail here. */
        if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
                                          unicode) < 0) {
            assert(0 && "ConvertWideCharToUCS4 failed");
            return -1;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the buffer. */
        unicode->data.any = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        unicode->_base.utf8 = NULL;
        unicode->_base.utf8_length = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    return 0;
}
908
Alexander Belopolsky40018472011-02-26 01:02:56 +0000909static void
910unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911{
Walter Dörwald16807132007-05-25 13:52:07 +0000912 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000913 case SSTATE_NOT_INTERNED:
914 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000915
Benjamin Peterson29060642009-01-31 22:14:21 +0000916 case SSTATE_INTERNED_MORTAL:
917 /* revive dead object temporarily for DelItem */
918 Py_REFCNT(unicode) = 3;
919 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
920 Py_FatalError(
921 "deletion of interned string failed");
922 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000923
Benjamin Peterson29060642009-01-31 22:14:21 +0000924 case SSTATE_INTERNED_IMMORTAL:
925 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000926
Benjamin Peterson29060642009-01-31 22:14:21 +0000927 default:
928 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000929 }
930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 if (_PyUnicode_WSTR(unicode) &&
932 (!PyUnicode_IS_READY(unicode) ||
933 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
934 PyObject_DEL(_PyUnicode_WSTR(unicode));
935 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
936 PyObject_DEL(unicode->_base.utf8);
937
938 if (PyUnicode_IS_COMPACT(unicode)) {
939 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940 }
941 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200942 if (unicode->data.any)
943 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000944 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000945 }
946}
947
Alexander Belopolsky40018472011-02-26 01:02:56 +0000948static int
949_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000950{
951 register PyUnicodeObject *v;
952
953 /* Argument checks */
954 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000955 PyErr_BadInternalCall();
956 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000957 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000958 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200959 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
960 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000961 PyErr_BadInternalCall();
962 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000963 }
964
965 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 possible since these are being shared.
967 The same goes for new-representation unicode objects or objects which
968 have already been readied.
969 For these, we simply return a fresh copy with the same Unicode content.
970 */
971 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
972 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
973 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000974 PyUnicodeObject *w = _PyUnicode_New(length);
975 if (w == NULL)
976 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
978 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000979 Py_DECREF(*unicode);
980 *unicode = w;
981 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000982 }
983
984 /* Note that we don't have to modify *unicode for unshared Unicode
985 objects, since we can modify them in-place. */
986 return unicode_resize(v, length);
987}
988
/* Public entry point for resizing a string: forwards to _PyUnicode_Resize()
   after casting the object pointer, and returns that function's result
   (0 on success, -1 on failure). */
int
PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
{
    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995static PyObject*
996get_latin1_char(unsigned char ch)
997{
998 PyUnicodeObject *unicode = unicode_latin1[ch];
999 if (!unicode) {
1000 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1001 if (!unicode)
1002 return NULL;
1003 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1004 unicode_latin1[ch] = unicode;
1005 }
1006 Py_INCREF(unicode);
1007 return (PyObject *)unicode;
1008}
1009
Alexander Belopolsky40018472011-02-26 01:02:56 +00001010PyObject *
1011PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001012{
1013 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001014 Py_UCS4 maxchar = 0;
1015 Py_ssize_t num_surrogates;
1016
1017 if (u == NULL)
1018 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001020 /* If the Unicode data is known at construction time, we can apply
1021 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001023 /* Optimization for empty strings */
1024 if (size == 0 && unicode_empty != NULL) {
1025 Py_INCREF(unicode_empty);
1026 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001027 }
Tim Petersced69f82003-09-16 20:30:58 +00001028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001029 /* Single character Unicode objects in the Latin-1 range are
1030 shared when using this constructor */
1031 if (size == 1 && *u < 256)
1032 return get_latin1_char((unsigned char)*u);
1033
1034 /* If not empty and not single character, copy the Unicode data
1035 into the new object */
Victor Stinner17222162011-09-28 22:15:37 +02001036 if (find_maxchar_surrogates(u, u + size, &maxchar,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 &num_surrogates) == -1)
1038 return NULL;
1039
1040 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1041 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042 if (!unicode)
1043 return NULL;
1044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 switch (PyUnicode_KIND(unicode)) {
1046 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001047 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1049 break;
1050 case PyUnicode_2BYTE_KIND:
1051#if Py_UNICODE_SIZE == 2
1052 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1053#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001054 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1056#endif
1057 break;
1058 case PyUnicode_4BYTE_KIND:
1059#if SIZEOF_WCHAR_T == 2
1060 /* This is the only case which has to process surrogates, thus
1061 a simple copy loop is not enough and we need a function. */
1062 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1063 Py_DECREF(unicode);
1064 return NULL;
1065 }
1066#else
1067 assert(num_surrogates == 0);
1068 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1069#endif
1070 break;
1071 default:
1072 assert(0 && "Impossible state");
1073 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074
1075 return (PyObject *)unicode;
1076}
1077
Alexander Belopolsky40018472011-02-26 01:02:56 +00001078PyObject *
1079PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001080{
1081 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001082
Benjamin Peterson14339b62009-01-31 16:36:08 +00001083 if (size < 0) {
1084 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001085 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001086 return NULL;
1087 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001088
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001089 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001090 some optimizations which share commonly used objects.
1091 Also, this means the input must be UTF-8, so fall back to the
1092 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001093 if (u != NULL) {
1094
Benjamin Peterson29060642009-01-31 22:14:21 +00001095 /* Optimization for empty strings */
1096 if (size == 0 && unicode_empty != NULL) {
1097 Py_INCREF(unicode_empty);
1098 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001099 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001100
1101 /* Single characters are shared when using this constructor.
1102 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 if (size == 1 && Py_CHARMASK(*u) < 128)
1104 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001105
1106 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001107 }
1108
Walter Dörwald55507312007-05-18 13:12:10 +00001109 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001110 if (!unicode)
1111 return NULL;
1112
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001113 return (PyObject *)unicode;
1114}
1115
Alexander Belopolsky40018472011-02-26 01:02:56 +00001116PyObject *
1117PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001118{
1119 size_t size = strlen(u);
1120 if (size > PY_SSIZE_T_MAX) {
1121 PyErr_SetString(PyExc_OverflowError, "input too long");
1122 return NULL;
1123 }
1124
1125 return PyUnicode_FromStringAndSize(u, size);
1126}
1127
Victor Stinnere57b1c02011-09-28 22:20:48 +02001128static PyObject*
1129_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001130{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131 PyObject *res;
1132 unsigned char max = 127;
1133 Py_ssize_t i;
1134 for (i = 0; i < size; i++) {
1135 if (u[i] & 0x80) {
1136 max = 255;
1137 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001138 }
1139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001140 res = PyUnicode_New(size, max);
1141 if (!res)
1142 return NULL;
1143 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1144 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001145}
1146
Victor Stinnere57b1c02011-09-28 22:20:48 +02001147static PyObject*
1148_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149{
1150 PyObject *res;
1151 Py_UCS2 max = 0;
1152 Py_ssize_t i;
1153 for (i = 0; i < size; i++)
1154 if (u[i] > max)
1155 max = u[i];
1156 res = PyUnicode_New(size, max);
1157 if (!res)
1158 return NULL;
1159 if (max >= 256)
1160 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1161 else
1162 for (i = 0; i < size; i++)
1163 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1164 return res;
1165}
1166
Victor Stinnere57b1c02011-09-28 22:20:48 +02001167static PyObject*
1168_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169{
1170 PyObject *res;
1171 Py_UCS4 max = 0;
1172 Py_ssize_t i;
1173 for (i = 0; i < size; i++)
1174 if (u[i] > max)
1175 max = u[i];
1176 res = PyUnicode_New(size, max);
1177 if (!res)
1178 return NULL;
1179 if (max >= 0x10000)
1180 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1181 else {
1182 int kind = PyUnicode_KIND(res);
1183 void *data = PyUnicode_DATA(res);
1184 for (i = 0; i < size; i++)
1185 PyUnicode_WRITE(kind, data, i, u[i]);
1186 }
1187 return res;
1188}
1189
1190PyObject*
1191PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1192{
1193 switch(kind) {
1194 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001195 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001196 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001197 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001198 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001199 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 }
1201 assert(0);
1202 return NULL;
1203}
1204
1205
1206/* Widen Unicode objects to larger buffers.
1207 Return NULL if the string is too wide already. */
1208
1209void*
1210_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1211{
1212 Py_ssize_t i;
1213 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1214 void *d = PyUnicode_DATA(s);
1215 unsigned int skind = PyUnicode_KIND(s);
1216 if (PyUnicode_KIND(s) >= kind) {
1217 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1218 return NULL;
1219 }
1220 switch(kind) {
1221 case PyUnicode_2BYTE_KIND: {
1222 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1223 if (!result) {
1224 PyErr_NoMemory();
1225 return 0;
1226 }
1227 for (i = 0; i < len; i++)
1228 result[i] = ((Py_UCS1*)d)[i];
1229 return result;
1230 }
1231 case PyUnicode_4BYTE_KIND: {
1232 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1233 if (!result) {
1234 PyErr_NoMemory();
1235 return 0;
1236 }
1237 for (i = 0; i < len; i++)
1238 result[i] = PyUnicode_READ(skind, d, i);
1239 return result;
1240 }
1241 }
1242 Py_FatalError("invalid kind");
1243 return NULL;
1244}
1245
1246static Py_UCS4*
1247as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1248 int copy_null)
1249{
1250 int kind;
1251 void *data;
1252 Py_ssize_t len, targetlen;
1253 if (PyUnicode_READY(string) == -1)
1254 return NULL;
1255 kind = PyUnicode_KIND(string);
1256 data = PyUnicode_DATA(string);
1257 len = PyUnicode_GET_LENGTH(string);
1258 targetlen = len;
1259 if (copy_null)
1260 targetlen++;
1261 if (!target) {
1262 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1263 PyErr_NoMemory();
1264 return NULL;
1265 }
1266 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1267 if (!target) {
1268 PyErr_NoMemory();
1269 return NULL;
1270 }
1271 }
1272 else {
1273 if (targetsize < targetlen) {
1274 PyErr_Format(PyExc_SystemError,
1275 "string is longer than the buffer");
1276 if (copy_null && 0 < targetsize)
1277 target[0] = 0;
1278 return NULL;
1279 }
1280 }
1281 if (kind != PyUnicode_4BYTE_KIND) {
1282 Py_ssize_t i;
1283 for (i = 0; i < len; i++)
1284 target[i] = PyUnicode_READ(kind, data, i);
1285 }
1286 else
1287 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1288 if (copy_null)
1289 target[len] = 0;
1290 return target;
1291}
1292
1293Py_UCS4*
1294PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1295 int copy_null)
1296{
1297 if (target == NULL || targetsize < 1) {
1298 PyErr_BadInternalCall();
1299 return NULL;
1300 }
1301 return as_ucs4(string, target, targetsize, copy_null);
1302}
1303
/* Return a newly allocated, 0-terminated Py_UCS4 copy of `string`, or NULL
   with an exception set on failure.  The buffer is obtained by as_ucs4()
   through PyMem_Malloc(). */
Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject *string)
{
    return as_ucs4(string, NULL, 0, 1);
}
1309
1310#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001311
Alexander Belopolsky40018472011-02-26 01:02:56 +00001312PyObject *
1313PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001315 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001316 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001318 PyErr_BadInternalCall();
1319 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320 }
1321
Martin v. Löwis790465f2008-04-05 20:41:37 +00001322 if (size == -1) {
1323 size = wcslen(w);
1324 }
1325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327}
1328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001329#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001330
Walter Dörwald346737f2007-05-31 10:44:43 +00001331static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001332makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1333 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001334{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001335 *fmt++ = '%';
1336 if (width) {
1337 if (zeropad)
1338 *fmt++ = '0';
1339 fmt += sprintf(fmt, "%d", width);
1340 }
1341 if (precision)
1342 fmt += sprintf(fmt, ".%d", precision);
1343 if (longflag)
1344 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001345 else if (longlongflag) {
1346 /* longlongflag should only ever be nonzero on machines with
1347 HAVE_LONG_LONG defined */
1348#ifdef HAVE_LONG_LONG
1349 char *f = PY_FORMAT_LONG_LONG;
1350 while (*f)
1351 *fmt++ = *f++;
1352#else
1353 /* we shouldn't ever get here */
1354 assert(0);
1355 *fmt++ = 'l';
1356#endif
1357 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001358 else if (size_tflag) {
1359 char *f = PY_FORMAT_SIZE_T;
1360 while (*f)
1361 *fmt++ = *f++;
1362 }
1363 *fmt++ = c;
1364 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001365}
1366
/* helper for PyUnicode_FromFormatV()

   Parse the "width.precision" part and the length modifier of the format
   specification starting at `f`, which must point at its '%'.  Each output
   pointer may be NULL, in which case that value is not reported.

   Return a pointer to the last character consumed: the conversion
   character when a recognised length modifier was skipped, otherwise the
   first character that is not part of the width/precision (or the
   character before it for the malformed inputs noted below). */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    while (Py_ISDIGIT((unsigned)*f))
        width = (width*10) + *f++ - '0';
    if (*f == '.') {
        f++;
        while (Py_ISDIGIT((unsigned)*f))
            precision = (precision*10) + *f++ - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Length modifiers are only honoured in front of d, u and i. */
    switch (*f) {
    case 'l':
        /* Handle %ld, %lu, %lld and %llu. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* handle the size_t flag. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1431
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign.
   PyUnicode_FromFormatV() uses it as the minimum room reserved for each
   formatted number. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1439
Walter Dörwaldd2034312007-05-18 16:29:38 +00001440PyObject *
1441PyUnicode_FromFormatV(const char *format, va_list vargs)
1442{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001443 va_list count;
1444 Py_ssize_t callcount = 0;
1445 PyObject **callresults = NULL;
1446 PyObject **callresult = NULL;
1447 Py_ssize_t n = 0;
1448 int width = 0;
1449 int precision = 0;
1450 int zeropad;
1451 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001453 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001454 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1456 Py_UCS4 argmaxchar;
1457 Py_ssize_t numbersize = 0;
1458 char *numberresults = NULL;
1459 char *numberresult = NULL;
1460 Py_ssize_t i;
1461 int kind;
1462 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001463
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001464 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001465 /* step 1: count the number of %S/%R/%A/%s format specifications
1466 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1467 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 * result in an array)
1469 * also esimate a upper bound for all the number formats in the string,
1470 * numbers will be formated in step 3 and be keept in a '\0'-separated
1471 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001472 for (f = format; *f; f++) {
1473 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001474 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1476 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1477 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1478 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001481#ifdef HAVE_LONG_LONG
1482 if (longlongflag) {
1483 if (width < MAX_LONG_LONG_CHARS)
1484 width = MAX_LONG_LONG_CHARS;
1485 }
1486 else
1487#endif
1488 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1489 including sign. Decimal takes the most space. This
1490 isn't enough for octal. If a width is specified we
1491 need more (which we allocate later). */
1492 if (width < MAX_LONG_CHARS)
1493 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494
1495 /* account for the size + '\0' to separate numbers
1496 inside of the numberresults buffer */
1497 numbersize += (width + 1);
1498 }
1499 }
1500 else if ((unsigned char)*f > 127) {
1501 PyErr_Format(PyExc_ValueError,
1502 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1503 "string, got a non-ASCII byte: 0x%02x",
1504 (unsigned char)*f);
1505 return NULL;
1506 }
1507 }
1508 /* step 2: allocate memory for the results of
1509 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1510 if (callcount) {
1511 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1512 if (!callresults) {
1513 PyErr_NoMemory();
1514 return NULL;
1515 }
1516 callresult = callresults;
1517 }
1518 /* step 2.5: allocate memory for the results of formating numbers */
1519 if (numbersize) {
1520 numberresults = PyObject_Malloc(numbersize);
1521 if (!numberresults) {
1522 PyErr_NoMemory();
1523 goto fail;
1524 }
1525 numberresult = numberresults;
1526 }
1527
1528 /* step 3: format numbers and figure out how large a buffer we need */
1529 for (f = format; *f; f++) {
1530 if (*f == '%') {
1531 const char* p;
1532 int longflag;
1533 int longlongflag;
1534 int size_tflag;
1535 int numprinted;
1536
1537 p = f;
1538 zeropad = (f[1] == '0');
1539 f = parse_format_flags(f, &width, &precision,
1540 &longflag, &longlongflag, &size_tflag);
1541 switch (*f) {
1542 case 'c':
1543 {
1544 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001545 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001546 n++;
1547 break;
1548 }
1549 case '%':
1550 n++;
1551 break;
1552 case 'i':
1553 case 'd':
1554 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1555 width, precision, *f);
1556 if (longflag)
1557 numprinted = sprintf(numberresult, fmt,
1558 va_arg(count, long));
1559#ifdef HAVE_LONG_LONG
1560 else if (longlongflag)
1561 numprinted = sprintf(numberresult, fmt,
1562 va_arg(count, PY_LONG_LONG));
1563#endif
1564 else if (size_tflag)
1565 numprinted = sprintf(numberresult, fmt,
1566 va_arg(count, Py_ssize_t));
1567 else
1568 numprinted = sprintf(numberresult, fmt,
1569 va_arg(count, int));
1570 n += numprinted;
1571 /* advance by +1 to skip over the '\0' */
1572 numberresult += (numprinted + 1);
1573 assert(*(numberresult - 1) == '\0');
1574 assert(*(numberresult - 2) != '\0');
1575 assert(numprinted >= 0);
1576 assert(numberresult <= numberresults + numbersize);
1577 break;
1578 case 'u':
1579 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1580 width, precision, 'u');
1581 if (longflag)
1582 numprinted = sprintf(numberresult, fmt,
1583 va_arg(count, unsigned long));
1584#ifdef HAVE_LONG_LONG
1585 else if (longlongflag)
1586 numprinted = sprintf(numberresult, fmt,
1587 va_arg(count, unsigned PY_LONG_LONG));
1588#endif
1589 else if (size_tflag)
1590 numprinted = sprintf(numberresult, fmt,
1591 va_arg(count, size_t));
1592 else
1593 numprinted = sprintf(numberresult, fmt,
1594 va_arg(count, unsigned int));
1595 n += numprinted;
1596 numberresult += (numprinted + 1);
1597 assert(*(numberresult - 1) == '\0');
1598 assert(*(numberresult - 2) != '\0');
1599 assert(numprinted >= 0);
1600 assert(numberresult <= numberresults + numbersize);
1601 break;
1602 case 'x':
1603 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1604 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1605 n += numprinted;
1606 numberresult += (numprinted + 1);
1607 assert(*(numberresult - 1) == '\0');
1608 assert(*(numberresult - 2) != '\0');
1609 assert(numprinted >= 0);
1610 assert(numberresult <= numberresults + numbersize);
1611 break;
1612 case 'p':
1613 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1614 /* %p is ill-defined: ensure leading 0x. */
1615 if (numberresult[1] == 'X')
1616 numberresult[1] = 'x';
1617 else if (numberresult[1] != 'x') {
1618 memmove(numberresult + 2, numberresult,
1619 strlen(numberresult) + 1);
1620 numberresult[0] = '0';
1621 numberresult[1] = 'x';
1622 numprinted += 2;
1623 }
1624 n += numprinted;
1625 numberresult += (numprinted + 1);
1626 assert(*(numberresult - 1) == '\0');
1627 assert(*(numberresult - 2) != '\0');
1628 assert(numprinted >= 0);
1629 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001630 break;
1631 case 's':
1632 {
1633 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001634 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001635 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1636 if (!str)
1637 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638 /* since PyUnicode_DecodeUTF8 returns already flexible
1639 unicode objects, there is no need to call ready on them */
1640 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001641 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001643 /* Remember the str and switch to the next slot */
1644 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001645 break;
1646 }
1647 case 'U':
1648 {
1649 PyObject *obj = va_arg(count, PyObject *);
1650 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 if (PyUnicode_READY(obj) == -1)
1652 goto fail;
1653 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001654 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001656 break;
1657 }
1658 case 'V':
1659 {
1660 PyObject *obj = va_arg(count, PyObject *);
1661 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001662 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001663 assert(obj || str);
1664 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001665 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 if (PyUnicode_READY(obj) == -1)
1667 goto fail;
1668 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001669 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001671 *callresult++ = NULL;
1672 }
1673 else {
1674 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1675 if (!str_obj)
1676 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001678 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001680 *callresult++ = str_obj;
1681 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001682 break;
1683 }
1684 case 'S':
1685 {
1686 PyObject *obj = va_arg(count, PyObject *);
1687 PyObject *str;
1688 assert(obj);
1689 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001691 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001692 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001693 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001695 /* Remember the str and switch to the next slot */
1696 *callresult++ = str;
1697 break;
1698 }
1699 case 'R':
1700 {
1701 PyObject *obj = va_arg(count, PyObject *);
1702 PyObject *repr;
1703 assert(obj);
1704 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001706 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001707 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001708 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001710 /* Remember the repr and switch to the next slot */
1711 *callresult++ = repr;
1712 break;
1713 }
1714 case 'A':
1715 {
1716 PyObject *obj = va_arg(count, PyObject *);
1717 PyObject *ascii;
1718 assert(obj);
1719 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001721 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001723 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001725 /* Remember the repr and switch to the next slot */
1726 *callresult++ = ascii;
1727 break;
1728 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001729 default:
1730 /* if we stumble upon an unknown
1731 formatting code, copy the rest of
1732 the format string to the output
1733 string. (we cannot just skip the
1734 code, since there's no way to know
1735 what's in the argument list) */
1736 n += strlen(p);
1737 goto expand;
1738 }
1739 } else
1740 n++;
1741 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001742 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001743 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001745 we don't have to resize the string.
1746 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001747 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001748 if (!string)
1749 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 kind = PyUnicode_KIND(string);
1751 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001752 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001756 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001757 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001758
1759 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1761 /* checking for == because the last argument could be a empty
1762 string, which causes i to point to end, the assert at the end of
1763 the loop */
1764 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001765
Benjamin Peterson14339b62009-01-31 16:36:08 +00001766 switch (*f) {
1767 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001768 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 const int ordinal = va_arg(vargs, int);
1770 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001771 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001772 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001773 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001774 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001775 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001776 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 case 'p':
1778 /* unused, since we already have the result */
1779 if (*f == 'p')
1780 (void) va_arg(vargs, void *);
1781 else
1782 (void) va_arg(vargs, int);
1783 /* extract the result from numberresults and append. */
1784 for (; *numberresult; ++i, ++numberresult)
1785 PyUnicode_WRITE(kind, data, i, *numberresult);
1786 /* skip over the separating '\0' */
1787 assert(*numberresult == '\0');
1788 numberresult++;
1789 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001790 break;
1791 case 's':
1792 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001793 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001795 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 size = PyUnicode_GET_LENGTH(*callresult);
1797 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001798 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1799 *callresult, 0,
1800 size) < 0)
1801 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001803 /* We're done with the unicode()/repr() => forget it */
1804 Py_DECREF(*callresult);
1805 /* switch to next unicode()/repr() result */
1806 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001807 break;
1808 }
1809 case 'U':
1810 {
1811 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 Py_ssize_t size;
1813 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1814 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001815 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1816 obj, 0,
1817 size) < 0)
1818 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001820 break;
1821 }
1822 case 'V':
1823 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001825 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001826 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001827 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 size = PyUnicode_GET_LENGTH(obj);
1829 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001830 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1831 obj, 0,
1832 size) < 0)
1833 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001835 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836 size = PyUnicode_GET_LENGTH(*callresult);
1837 assert(PyUnicode_KIND(*callresult) <=
1838 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001839 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1840 *callresult,
1841 0, size) < 0)
1842 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001844 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001845 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001846 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001847 break;
1848 }
1849 case 'S':
1850 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001851 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001852 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001853 /* unused, since we already have the result */
1854 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001856 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1857 *callresult, 0,
1858 PyUnicode_GET_LENGTH(*callresult)) < 0)
1859 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001861 /* We're done with the unicode()/repr() => forget it */
1862 Py_DECREF(*callresult);
1863 /* switch to next unicode()/repr() result */
1864 ++callresult;
1865 break;
1866 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001867 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001869 break;
1870 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871 for (; *p; ++p, ++i)
1872 PyUnicode_WRITE(kind, data, i, *p);
1873 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001874 goto end;
1875 }
Victor Stinner1205f272010-09-11 00:54:47 +00001876 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001877 else {
1878 assert(i < PyUnicode_GET_LENGTH(string));
1879 PyUnicode_WRITE(kind, data, i++, *f);
1880 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001881 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001882 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001883
Benjamin Peterson29060642009-01-31 22:14:21 +00001884 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001885 if (callresults)
1886 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001887 if (numberresults)
1888 PyObject_Free(numberresults);
1889 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001890 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001891 if (callresults) {
1892 PyObject **callresult2 = callresults;
1893 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001894 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001895 ++callresult2;
1896 }
1897 PyObject_Free(callresults);
1898 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001899 if (numberresults)
1900 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001901 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001902}
1903
Walter Dörwaldd2034312007-05-18 16:29:38 +00001904PyObject *
1905PyUnicode_FromFormat(const char *format, ...)
1906{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001907 PyObject* ret;
1908 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001909
1910#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001911 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001912#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001913 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001914#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001915 ret = PyUnicode_FromFormatV(format, vargs);
1916 va_end(vargs);
1917 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001918}
1919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920#ifdef HAVE_WCHAR_H
1921
Victor Stinner5593d8a2010-10-02 11:11:27 +00001922/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1923 convert a Unicode object to a wide character string.
1924
Victor Stinnerd88d9832011-09-06 02:00:05 +02001925 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001926 character) required to convert the unicode object. Ignore size argument.
1927
Victor Stinnerd88d9832011-09-06 02:00:05 +02001928 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001929 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001930 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001931static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001932unicode_aswidechar(PyUnicodeObject *unicode,
1933 wchar_t *w,
1934 Py_ssize_t size)
1935{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001936 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001937 const wchar_t *wstr;
1938
1939 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1940 if (wstr == NULL)
1941 return -1;
1942
Victor Stinner5593d8a2010-10-02 11:11:27 +00001943 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001944 if (size > res)
1945 size = res + 1;
1946 else
1947 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001949 return res;
1950 }
1951 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001953}
1954
1955Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001956PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001957 wchar_t *w,
1958 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959{
1960 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001961 PyErr_BadInternalCall();
1962 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001964 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965}
1966
Victor Stinner137c34c2010-09-29 10:25:54 +00001967wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001968PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001969 Py_ssize_t *size)
1970{
1971 wchar_t* buffer;
1972 Py_ssize_t buflen;
1973
1974 if (unicode == NULL) {
1975 PyErr_BadInternalCall();
1976 return NULL;
1977 }
1978
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001979 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980 if (buflen == -1)
1981 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001982 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001983 PyErr_NoMemory();
1984 return NULL;
1985 }
1986
Victor Stinner137c34c2010-09-29 10:25:54 +00001987 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1988 if (buffer == NULL) {
1989 PyErr_NoMemory();
1990 return NULL;
1991 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001992 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 if (buflen == -1)
1994 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001995 if (size != NULL)
1996 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001997 return buffer;
1998}
1999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
Alexander Belopolsky40018472011-02-26 01:02:56 +00002002PyObject *
2003PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002004{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002006 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002007 PyErr_SetString(PyExc_ValueError,
2008 "chr() arg not in range(0x110000)");
2009 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002010 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 if (ordinal < 256)
2013 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002015 v = PyUnicode_New(1, ordinal);
2016 if (v == NULL)
2017 return NULL;
2018 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2019 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002020}
2021
Alexander Belopolsky40018472011-02-26 01:02:56 +00002022PyObject *
2023PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002025 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002026 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002027 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002028 Py_INCREF(obj);
2029 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002030 }
2031 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002032 /* For a Unicode subtype that's not a Unicode object,
2033 return a true Unicode object with the same data. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 if (PyUnicode_READY(obj) == -1)
2035 return NULL;
2036 return substring((PyUnicodeObject *)obj, 0, PyUnicode_GET_LENGTH(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002037 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002038 PyErr_Format(PyExc_TypeError,
2039 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002040 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002041 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002042}
2043
Alexander Belopolsky40018472011-02-26 01:02:56 +00002044PyObject *
2045PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002046 const char *encoding,
2047 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002048{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002049 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002050 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002051
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002053 PyErr_BadInternalCall();
2054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002056
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002057 /* Decoding bytes objects is the most common case and should be fast */
2058 if (PyBytes_Check(obj)) {
2059 if (PyBytes_GET_SIZE(obj) == 0) {
2060 Py_INCREF(unicode_empty);
2061 v = (PyObject *) unicode_empty;
2062 }
2063 else {
2064 v = PyUnicode_Decode(
2065 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2066 encoding, errors);
2067 }
2068 return v;
2069 }
2070
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002071 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002072 PyErr_SetString(PyExc_TypeError,
2073 "decoding str is not supported");
2074 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002075 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002076
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002077 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2078 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2079 PyErr_Format(PyExc_TypeError,
2080 "coercing to str: need bytes, bytearray "
2081 "or buffer-like object, %.80s found",
2082 Py_TYPE(obj)->tp_name);
2083 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002084 }
Tim Petersced69f82003-09-16 20:30:58 +00002085
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002086 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002087 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002088 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089 }
Tim Petersced69f82003-09-16 20:30:58 +00002090 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002091 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002092
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002093 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002094 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002095}
2096
/* Copy encoding into lower, converting upper case characters to lower case
   and replacing '_' with '-', so that spellings such as UTF_8 are caught.

   lower_len is the capacity of lower, terminating null byte included.
   Return 1 on success (lower is then null-terminated) and 0 if encoding is
   longer than lower_len-1 characters; in that case lower is left without a
   terminating null byte. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst;
    /* Last slot of the output buffer, reserved for the null byte. */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding, dst = lower; *src; src++, dst++) {
        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2129
Alexander Belopolsky40018472011-02-26 01:02:56 +00002130PyObject *
2131PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002132 Py_ssize_t size,
2133 const char *encoding,
2134 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002135{
2136 PyObject *buffer = NULL, *unicode;
2137 Py_buffer info;
2138 char lower[11]; /* Enough for any encoding shortcut */
2139
2140 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002141 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002142
2143 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002144 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002145 if ((strcmp(lower, "utf-8") == 0) ||
2146 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002147 return PyUnicode_DecodeUTF8(s, size, errors);
2148 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002149 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002150 (strcmp(lower, "iso-8859-1") == 0))
2151 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002152#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002153 else if (strcmp(lower, "mbcs") == 0)
2154 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002155#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002156 else if (strcmp(lower, "ascii") == 0)
2157 return PyUnicode_DecodeASCII(s, size, errors);
2158 else if (strcmp(lower, "utf-16") == 0)
2159 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2160 else if (strcmp(lower, "utf-32") == 0)
2161 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2162 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163
2164 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002165 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002166 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002167 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002168 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 if (buffer == NULL)
2170 goto onError;
2171 unicode = PyCodec_Decode(buffer, encoding, errors);
2172 if (unicode == NULL)
2173 goto onError;
2174 if (!PyUnicode_Check(unicode)) {
2175 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002176 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002177 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178 Py_DECREF(unicode);
2179 goto onError;
2180 }
2181 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 if (PyUnicode_READY(unicode)) {
2183 Py_DECREF(unicode);
2184 return NULL;
2185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002187
Benjamin Peterson29060642009-01-31 22:14:21 +00002188 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189 Py_XDECREF(buffer);
2190 return NULL;
2191}
2192
Alexander Belopolsky40018472011-02-26 01:02:56 +00002193PyObject *
2194PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002195 const char *encoding,
2196 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002197{
2198 PyObject *v;
2199
2200 if (!PyUnicode_Check(unicode)) {
2201 PyErr_BadArgument();
2202 goto onError;
2203 }
2204
2205 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002206 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002207
2208 /* Decode via the codec registry */
2209 v = PyCodec_Decode(unicode, encoding, errors);
2210 if (v == NULL)
2211 goto onError;
2212 return v;
2213
Benjamin Peterson29060642009-01-31 22:14:21 +00002214 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002215 return NULL;
2216}
2217
Alexander Belopolsky40018472011-02-26 01:02:56 +00002218PyObject *
2219PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002220 const char *encoding,
2221 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002222{
2223 PyObject *v;
2224
2225 if (!PyUnicode_Check(unicode)) {
2226 PyErr_BadArgument();
2227 goto onError;
2228 }
2229
2230 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002231 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002232
2233 /* Decode via the codec registry */
2234 v = PyCodec_Decode(unicode, encoding, errors);
2235 if (v == NULL)
2236 goto onError;
2237 if (!PyUnicode_Check(v)) {
2238 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002239 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002240 Py_TYPE(v)->tp_name);
2241 Py_DECREF(v);
2242 goto onError;
2243 }
2244 return v;
2245
Benjamin Peterson29060642009-01-31 22:14:21 +00002246 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002247 return NULL;
2248}
2249
Alexander Belopolsky40018472011-02-26 01:02:56 +00002250PyObject *
2251PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002252 Py_ssize_t size,
2253 const char *encoding,
2254 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255{
2256 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002257
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 unicode = PyUnicode_FromUnicode(s, size);
2259 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2262 Py_DECREF(unicode);
2263 return v;
2264}
2265
Alexander Belopolsky40018472011-02-26 01:02:56 +00002266PyObject *
2267PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002268 const char *encoding,
2269 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002270{
2271 PyObject *v;
2272
2273 if (!PyUnicode_Check(unicode)) {
2274 PyErr_BadArgument();
2275 goto onError;
2276 }
2277
2278 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002279 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002280
2281 /* Encode via the codec registry */
2282 v = PyCodec_Encode(unicode, encoding, errors);
2283 if (v == NULL)
2284 goto onError;
2285 return v;
2286
Benjamin Peterson29060642009-01-31 22:14:21 +00002287 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002288 return NULL;
2289}
2290
Victor Stinnerad158722010-10-27 00:25:46 +00002291PyObject *
2292PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002293{
Victor Stinner99b95382011-07-04 14:23:54 +02002294#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002295 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2296 PyUnicode_GET_SIZE(unicode),
2297 NULL);
2298#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002299 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002300#else
Victor Stinner793b5312011-04-27 00:24:21 +02002301 PyInterpreterState *interp = PyThreadState_GET()->interp;
2302 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2303 cannot use it to encode and decode filenames before it is loaded. Load
2304 the Python codec requires to encode at least its own filename. Use the C
2305 version of the locale codec until the codec registry is initialized and
2306 the Python codec is loaded.
2307
2308 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2309 cannot only rely on it: check also interp->fscodec_initialized for
2310 subinterpreters. */
2311 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002312 return PyUnicode_AsEncodedString(unicode,
2313 Py_FileSystemDefaultEncoding,
2314 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002315 }
2316 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002317 /* locale encoding with surrogateescape */
2318 wchar_t *wchar;
2319 char *bytes;
2320 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002321 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002322
2323 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2324 if (wchar == NULL)
2325 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002326 bytes = _Py_wchar2char(wchar, &error_pos);
2327 if (bytes == NULL) {
2328 if (error_pos != (size_t)-1) {
2329 char *errmsg = strerror(errno);
2330 PyObject *exc = NULL;
2331 if (errmsg == NULL)
2332 errmsg = "Py_wchar2char() failed";
2333 raise_encode_exception(&exc,
2334 "filesystemencoding",
2335 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2336 error_pos, error_pos+1,
2337 errmsg);
2338 Py_XDECREF(exc);
2339 }
2340 else
2341 PyErr_NoMemory();
2342 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002343 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002344 }
2345 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002346
2347 bytes_obj = PyBytes_FromString(bytes);
2348 PyMem_Free(bytes);
2349 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002350 }
Victor Stinnerad158722010-10-27 00:25:46 +00002351#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002352}
2353
Alexander Belopolsky40018472011-02-26 01:02:56 +00002354PyObject *
2355PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002356 const char *encoding,
2357 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358{
2359 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002360 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002361
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362 if (!PyUnicode_Check(unicode)) {
2363 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002364 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365 }
Fred Drakee4315f52000-05-09 19:53:39 +00002366
Victor Stinner2f283c22011-03-02 01:21:46 +00002367 if (encoding == NULL) {
2368 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002370 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002372 }
Fred Drakee4315f52000-05-09 19:53:39 +00002373
2374 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002375 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002376 if ((strcmp(lower, "utf-8") == 0) ||
2377 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002378 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002379 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002380 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002381 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002383 }
Victor Stinner37296e82010-06-10 13:36:23 +00002384 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002385 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002386 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002388#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002389 else if (strcmp(lower, "mbcs") == 0)
2390 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2391 PyUnicode_GET_SIZE(unicode),
2392 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002393#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002394 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002396 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397
2398 /* Encode via the codec registry */
2399 v = PyCodec_Encode(unicode, encoding, errors);
2400 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002401 return NULL;
2402
2403 /* The normal path */
2404 if (PyBytes_Check(v))
2405 return v;
2406
2407 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002408 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002409 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002410 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002411
2412 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2413 "encoder %s returned bytearray instead of bytes",
2414 encoding);
2415 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002416 Py_DECREF(v);
2417 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002418 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002419
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002420 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2421 Py_DECREF(v);
2422 return b;
2423 }
2424
2425 PyErr_Format(PyExc_TypeError,
2426 "encoder did not return a bytes object (type=%.400s)",
2427 Py_TYPE(v)->tp_name);
2428 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002429 return NULL;
2430}
2431
Alexander Belopolsky40018472011-02-26 01:02:56 +00002432PyObject *
2433PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002434 const char *encoding,
2435 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002436{
2437 PyObject *v;
2438
2439 if (!PyUnicode_Check(unicode)) {
2440 PyErr_BadArgument();
2441 goto onError;
2442 }
2443
2444 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002445 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002446
2447 /* Encode via the codec registry */
2448 v = PyCodec_Encode(unicode, encoding, errors);
2449 if (v == NULL)
2450 goto onError;
2451 if (!PyUnicode_Check(v)) {
2452 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002453 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002454 Py_TYPE(v)->tp_name);
2455 Py_DECREF(v);
2456 goto onError;
2457 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002459
Benjamin Peterson29060642009-01-31 22:14:21 +00002460 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461 return NULL;
2462}
2463
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002464PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002465PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002466 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002467 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2468}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002469
Christian Heimes5894ba72007-11-04 11:43:14 +00002470PyObject*
2471PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2472{
Victor Stinner99b95382011-07-04 14:23:54 +02002473#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002474 return PyUnicode_DecodeMBCS(s, size, NULL);
2475#elif defined(__APPLE__)
2476 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2477#else
Victor Stinner793b5312011-04-27 00:24:21 +02002478 PyInterpreterState *interp = PyThreadState_GET()->interp;
2479 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2480 cannot use it to encode and decode filenames before it is loaded. Load
2481 the Python codec requires to encode at least its own filename. Use the C
2482 version of the locale codec until the codec registry is initialized and
2483 the Python codec is loaded.
2484
2485 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2486 cannot only rely on it: check also interp->fscodec_initialized for
2487 subinterpreters. */
2488 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002489 return PyUnicode_Decode(s, size,
2490 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002491 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002492 }
2493 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002494 /* locale encoding with surrogateescape */
2495 wchar_t *wchar;
2496 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002497 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002498
2499 if (s[size] != '\0' || size != strlen(s)) {
2500 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2501 return NULL;
2502 }
2503
Victor Stinner168e1172010-10-16 23:16:16 +00002504 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002505 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002506 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002507
Victor Stinner168e1172010-10-16 23:16:16 +00002508 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002509 PyMem_Free(wchar);
2510 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002511 }
Victor Stinnerad158722010-10-27 00:25:46 +00002512#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002513}
2514
Martin v. Löwis011e8422009-05-05 04:43:17 +00002515
2516int
2517PyUnicode_FSConverter(PyObject* arg, void* addr)
2518{
2519 PyObject *output = NULL;
2520 Py_ssize_t size;
2521 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002522 if (arg == NULL) {
2523 Py_DECREF(*(PyObject**)addr);
2524 return 1;
2525 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002526 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002527 output = arg;
2528 Py_INCREF(output);
2529 }
2530 else {
2531 arg = PyUnicode_FromObject(arg);
2532 if (!arg)
2533 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002534 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002535 Py_DECREF(arg);
2536 if (!output)
2537 return 0;
2538 if (!PyBytes_Check(output)) {
2539 Py_DECREF(output);
2540 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2541 return 0;
2542 }
2543 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002544 size = PyBytes_GET_SIZE(output);
2545 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002546 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002547 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002548 Py_DECREF(output);
2549 return 0;
2550 }
2551 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002552 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002553}
2554
2555
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002556int
2557PyUnicode_FSDecoder(PyObject* arg, void* addr)
2558{
2559 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002560 if (arg == NULL) {
2561 Py_DECREF(*(PyObject**)addr);
2562 return 1;
2563 }
2564 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 if (PyUnicode_READY(arg))
2566 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002567 output = arg;
2568 Py_INCREF(output);
2569 }
2570 else {
2571 arg = PyBytes_FromObject(arg);
2572 if (!arg)
2573 return 0;
2574 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2575 PyBytes_GET_SIZE(arg));
2576 Py_DECREF(arg);
2577 if (!output)
2578 return 0;
2579 if (!PyUnicode_Check(output)) {
2580 Py_DECREF(output);
2581 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2582 return 0;
2583 }
2584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002585 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2586 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002587 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2588 Py_DECREF(output);
2589 return 0;
2590 }
2591 *(PyObject**)addr = output;
2592 return Py_CLEANUP_SUPPORTED;
2593}
2594
2595
Martin v. Löwis5b222132007-06-10 09:51:05 +00002596char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002598{
Christian Heimesf3863112007-11-22 07:46:41 +00002599 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2601
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002602 if (!PyUnicode_Check(unicode)) {
2603 PyErr_BadArgument();
2604 return NULL;
2605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002606 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002607 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608
2609 if (_PyUnicode_UTF8(unicode) == NULL) {
2610 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2611 if (bytes == NULL)
2612 return NULL;
2613 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2614 if (u->_base.utf8 == NULL) {
2615 Py_DECREF(bytes);
2616 return NULL;
2617 }
2618 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2619 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2620 Py_DECREF(bytes);
2621 }
2622
2623 if (psize)
2624 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2625 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002626}
2627
2628char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002629PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002630{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2632}
2633
2634#ifdef Py_DEBUG
2635int unicode_as_unicode_calls = 0;
2636#endif
2637
2638
2639Py_UNICODE *
2640PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2641{
2642 PyUnicodeObject *u;
2643 const unsigned char *one_byte;
2644#if SIZEOF_WCHAR_T == 4
2645 const Py_UCS2 *two_bytes;
2646#else
2647 const Py_UCS4 *four_bytes;
2648 const Py_UCS4 *ucs4_end;
2649 Py_ssize_t num_surrogates;
2650#endif
2651 wchar_t *w;
2652 wchar_t *wchar_end;
2653
2654 if (!PyUnicode_Check(unicode)) {
2655 PyErr_BadArgument();
2656 return NULL;
2657 }
2658 u = (PyUnicodeObject*)unicode;
2659 if (_PyUnicode_WSTR(u) == NULL) {
2660 /* Non-ASCII compact unicode object */
2661 assert(_PyUnicode_KIND(u) != 0);
2662 assert(PyUnicode_IS_READY(u));
2663
2664#ifdef Py_DEBUG
2665 ++unicode_as_unicode_calls;
2666#endif
2667
2668 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2669#if SIZEOF_WCHAR_T == 2
2670 four_bytes = PyUnicode_4BYTE_DATA(u);
2671 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2672 num_surrogates = 0;
2673
2674 for (; four_bytes < ucs4_end; ++four_bytes) {
2675 if (*four_bytes > 0xFFFF)
2676 ++num_surrogates;
2677 }
2678
2679 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2680 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2681 if (!_PyUnicode_WSTR(u)) {
2682 PyErr_NoMemory();
2683 return NULL;
2684 }
2685 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2686
2687 w = _PyUnicode_WSTR(u);
2688 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2689 four_bytes = PyUnicode_4BYTE_DATA(u);
2690 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2691 if (*four_bytes > 0xFFFF) {
2692 /* encode surrogate pair in this case */
2693 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2694 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2695 }
2696 else
2697 *w = *four_bytes;
2698
2699 if (w > wchar_end) {
2700 assert(0 && "Miscalculated string end");
2701 }
2702 }
2703 *w = 0;
2704#else
2705 /* sizeof(wchar_t) == 4 */
2706 Py_FatalError("Impossible unicode object state, wstr and str "
2707 "should share memory already.");
2708 return NULL;
2709#endif
2710 }
2711 else {
2712 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2713 (_PyUnicode_LENGTH(u) + 1));
2714 if (!_PyUnicode_WSTR(u)) {
2715 PyErr_NoMemory();
2716 return NULL;
2717 }
2718 if (!PyUnicode_IS_COMPACT_ASCII(u))
2719 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2720 w = _PyUnicode_WSTR(u);
2721 wchar_end = w + _PyUnicode_LENGTH(u);
2722
2723 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2724 one_byte = PyUnicode_1BYTE_DATA(u);
2725 for (; w < wchar_end; ++one_byte, ++w)
2726 *w = *one_byte;
2727 /* null-terminate the wstr */
2728 *w = 0;
2729 }
2730 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2731#if SIZEOF_WCHAR_T == 4
2732 two_bytes = PyUnicode_2BYTE_DATA(u);
2733 for (; w < wchar_end; ++two_bytes, ++w)
2734 *w = *two_bytes;
2735 /* null-terminate the wstr */
2736 *w = 0;
2737#else
2738 /* sizeof(wchar_t) == 2 */
2739 PyObject_FREE(_PyUnicode_WSTR(u));
2740 _PyUnicode_WSTR(u) = NULL;
2741 Py_FatalError("Impossible unicode object state, wstr "
2742 "and str should share memory already.");
2743 return NULL;
2744#endif
2745 }
2746 else {
2747 assert(0 && "This should never happen.");
2748 }
2749 }
2750 }
2751 if (size != NULL)
2752 *size = PyUnicode_WSTR_LENGTH(u);
2753 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002754}
2755
Alexander Belopolsky40018472011-02-26 01:02:56 +00002756Py_UNICODE *
2757PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002759 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760}
2761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002762
Alexander Belopolsky40018472011-02-26 01:02:56 +00002763Py_ssize_t
2764PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765{
2766 if (!PyUnicode_Check(unicode)) {
2767 PyErr_BadArgument();
2768 goto onError;
2769 }
2770 return PyUnicode_GET_SIZE(unicode);
2771
Benjamin Peterson29060642009-01-31 22:14:21 +00002772 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 return -1;
2774}
2775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002776Py_ssize_t
2777PyUnicode_GetLength(PyObject *unicode)
2778{
2779 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2780 PyErr_BadArgument();
2781 return -1;
2782 }
2783
2784 return PyUnicode_GET_LENGTH(unicode);
2785}
2786
2787Py_UCS4
2788PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2789{
2790 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2791 return PyErr_BadArgument();
2792 return (Py_UCS4)-1;
2793 }
2794 return PyUnicode_READ_CHAR(unicode, index);
2795}
2796
2797int
2798PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2799{
2800 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2801 return PyErr_BadArgument();
2802 return -1;
2803 }
2804
2805 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2806 index, ch);
2807 return 0;
2808}
2809
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2815
Victor Stinner554f3f02010-06-16 23:33:54 +00002816/* create or adjust a UnicodeDecodeError */
2817static void
2818make_decode_exception(PyObject **exceptionObject,
2819 const char *encoding,
2820 const char *input, Py_ssize_t length,
2821 Py_ssize_t startpos, Py_ssize_t endpos,
2822 const char *reason)
2823{
2824 if (*exceptionObject == NULL) {
2825 *exceptionObject = PyUnicodeDecodeError_Create(
2826 encoding, input, length, startpos, endpos, reason);
2827 }
2828 else {
2829 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2830 goto onError;
2831 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2832 goto onError;
2833 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2834 goto onError;
2835 }
2836 return;
2837
2838onError:
2839 Py_DECREF(*exceptionObject);
2840 *exceptionObject = NULL;
2841}
2842
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002843/* error handling callback helper:
2844 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002845 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846 and adjust various state variables.
2847 return 0 on success, -1 on error
2848*/
2849
Alexander Belopolsky40018472011-02-26 01:02:56 +00002850static int
2851unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002852 const char *encoding, const char *reason,
2853 const char **input, const char **inend, Py_ssize_t *startinpos,
2854 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2855 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002856{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002857 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002858
2859 PyObject *restuple = NULL;
2860 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002861 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002862 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002863 Py_ssize_t requiredsize;
2864 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002865 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002866 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002867 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002868 int res = -1;
2869
2870 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002871 *errorHandler = PyCodec_LookupError(errors);
2872 if (*errorHandler == NULL)
2873 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002874 }
2875
Victor Stinner554f3f02010-06-16 23:33:54 +00002876 make_decode_exception(exceptionObject,
2877 encoding,
2878 *input, *inend - *input,
2879 *startinpos, *endinpos,
2880 reason);
2881 if (*exceptionObject == NULL)
2882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002883
2884 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2885 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002886 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002888 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002889 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002890 }
2891 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002892 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002893
2894 /* Copy back the bytes variables, which might have been modified by the
2895 callback */
2896 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2897 if (!inputobj)
2898 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002899 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002900 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002901 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002902 *input = PyBytes_AS_STRING(inputobj);
2903 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002904 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002905 /* we can DECREF safely, as the exception has another reference,
2906 so the object won't go away. */
2907 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002908
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002909 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002910 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002911 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002912 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2913 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002914 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002915
2916 /* need more space? (at least enough for what we
2917 have+the replacement+the rest of the string (starting
2918 at the new input position), so we won't have to check space
2919 when there are no errors in the rest of the string) */
2920 repptr = PyUnicode_AS_UNICODE(repunicode);
2921 repsize = PyUnicode_GET_SIZE(repunicode);
2922 requiredsize = *outpos + repsize + insize-newpos;
2923 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002924 if (requiredsize<2*outsize)
2925 requiredsize = 2*outsize;
2926 if (_PyUnicode_Resize(output, requiredsize) < 0)
2927 goto onError;
2928 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002929 }
2930 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002931 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002932 Py_UNICODE_COPY(*outptr, repptr, repsize);
2933 *outptr += repsize;
2934 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002935
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002936 /* we made it! */
2937 res = 0;
2938
Benjamin Peterson29060642009-01-31 22:14:21 +00002939 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940 Py_XDECREF(restuple);
2941 return res;
2942}
2943
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002944/* --- UTF-7 Codec -------------------------------------------------------- */
2945
Antoine Pitrou244651a2009-05-04 18:56:13 +00002946/* See RFC2152 for details. We encode conservatively and decode liberally. */
2947
/* Three small macros implementing the base-64 alphabet.  Each one may
   evaluate its argument more than once. */

/* True if c is one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* The 6-bit value of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds; any character not matched explicitly yields 63. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* The base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, met in a UTF-7 string, decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * the + that begins a base64 string. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  Each entry of utf7_category holds one of:
 *   0 : "Set D"       alphanumeric and '(),-./:?
 *   1 : "Set O"       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"  ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
    /* 0x00-0x0f: nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10-0x1f: dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20-0x2f: sp ! " # $ % & ' ( ) * + , - . / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30-0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40-0x4f: @ A B C D E F G H I J K L M N O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5f: P Q R S T U V W X Y Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60-0x6f: ` a b c d e f g h i j k l m n o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7f: p q r s t u v w x y z { | } ~ del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};

/* ENCODE_DIRECT: true if character c should be encoded as itself.
 * Set D characters always are; Set O characters only when directO is
 * true and whitespace only when directWS is true.  RFC2152 makes it
 * clear that these choices vary between applications, so both are
 * left to the caller.  The range test comes first so the table is
 * never indexed with a value outside 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directO && (utf7_category[(c)] == 1)) ||         \
      (directWS && (utf7_category[(c)] == 2))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003024
Alexander Belopolsky40018472011-02-26 01:02:56 +00003025PyObject *
3026PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003027 Py_ssize_t size,
3028 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003029{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003030 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3031}
3032
Antoine Pitrou244651a2009-05-04 18:56:13 +00003033/* The decoder. The only state we preserve is our read position,
3034 * i.e. how many characters we have consumed. So if we end in the
3035 * middle of a shift sequence we have to back off the read position
3036 * and the output to the beginning of the sequence, otherwise we lose
3037 * all the shift state (seen bits, number of bits seen, high
3038 * surrogate). */
3039
Alexander Belopolsky40018472011-02-26 01:02:56 +00003040PyObject *
3041PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003042 Py_ssize_t size,
3043 const char *errors,
3044 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003045{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003046 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003047 Py_ssize_t startinpos;
3048 Py_ssize_t endinpos;
3049 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003050 const char *e;
3051 PyUnicodeObject *unicode;
3052 Py_UNICODE *p;
3053 const char *errmsg = "";
3054 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003055 Py_UNICODE *shiftOutStart;
3056 unsigned int base64bits = 0;
3057 unsigned long base64buffer = 0;
3058 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 PyObject *errorHandler = NULL;
3060 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003061
3062 unicode = _PyUnicode_New(size);
3063 if (!unicode)
3064 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003065 if (size == 0) {
3066 if (consumed)
3067 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003068 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003069 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003071 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003072 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003073 e = s + size;
3074
3075 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003078 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003079
Antoine Pitrou244651a2009-05-04 18:56:13 +00003080 if (inShift) { /* in a base-64 section */
3081 if (IS_BASE64(ch)) { /* consume a base-64 character */
3082 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3083 base64bits += 6;
3084 s++;
3085 if (base64bits >= 16) {
3086 /* we have enough bits for a UTF-16 value */
3087 Py_UNICODE outCh = (Py_UNICODE)
3088 (base64buffer >> (base64bits-16));
3089 base64bits -= 16;
3090 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3091 if (surrogate) {
3092 /* expecting a second surrogate */
3093 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3094#ifdef Py_UNICODE_WIDE
3095 *p++ = (((surrogate & 0x3FF)<<10)
3096 | (outCh & 0x3FF)) + 0x10000;
3097#else
3098 *p++ = surrogate;
3099 *p++ = outCh;
3100#endif
3101 surrogate = 0;
3102 }
3103 else {
3104 surrogate = 0;
3105 errmsg = "second surrogate missing";
3106 goto utf7Error;
3107 }
3108 }
3109 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3110 /* first surrogate */
3111 surrogate = outCh;
3112 }
3113 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3114 errmsg = "unexpected second surrogate";
3115 goto utf7Error;
3116 }
3117 else {
3118 *p++ = outCh;
3119 }
3120 }
3121 }
3122 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003123 inShift = 0;
3124 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003125 if (surrogate) {
3126 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003127 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003128 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003129 if (base64bits > 0) { /* left-over bits */
3130 if (base64bits >= 6) {
3131 /* We've seen at least one base-64 character */
3132 errmsg = "partial character in shift sequence";
3133 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003134 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003135 else {
3136 /* Some bits remain; they should be zero */
3137 if (base64buffer != 0) {
3138 errmsg = "non-zero padding bits in shift sequence";
3139 goto utf7Error;
3140 }
3141 }
3142 }
3143 if (ch != '-') {
3144 /* '-' is absorbed; other terminating
3145 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003146 *p++ = ch;
3147 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003148 }
3149 }
3150 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003152 s++; /* consume '+' */
3153 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003154 s++;
3155 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003156 }
3157 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003158 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003159 shiftOutStart = p;
3160 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003161 }
3162 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003163 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003164 *p++ = ch;
3165 s++;
3166 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003167 else {
3168 startinpos = s-starts;
3169 s++;
3170 errmsg = "unexpected special character";
3171 goto utf7Error;
3172 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003173 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003174utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003175 outpos = p-PyUnicode_AS_UNICODE(unicode);
3176 endinpos = s-starts;
3177 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003178 errors, &errorHandler,
3179 "utf7", errmsg,
3180 &starts, &e, &startinpos, &endinpos, &exc, &s,
3181 &unicode, &outpos, &p))
3182 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003183 }
3184
Antoine Pitrou244651a2009-05-04 18:56:13 +00003185 /* end of string */
3186
3187 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3188 /* if we're in an inconsistent state, that's an error */
3189 if (surrogate ||
3190 (base64bits >= 6) ||
3191 (base64bits > 0 && base64buffer != 0)) {
3192 outpos = p-PyUnicode_AS_UNICODE(unicode);
3193 endinpos = size;
3194 if (unicode_decode_call_errorhandler(
3195 errors, &errorHandler,
3196 "utf7", "unterminated shift sequence",
3197 &starts, &e, &startinpos, &endinpos, &exc, &s,
3198 &unicode, &outpos, &p))
3199 goto onError;
3200 if (s < e)
3201 goto restart;
3202 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003203 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003204
3205 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003206 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003207 if (inShift) {
3208 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003209 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003210 }
3211 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003212 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003213 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003214 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003215
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003216 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003217 goto onError;
3218
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003219 Py_XDECREF(errorHandler);
3220 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003221 if (PyUnicode_READY(unicode) == -1) {
3222 Py_DECREF(unicode);
3223 return NULL;
3224 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003225 return (PyObject *)unicode;
3226
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003228 Py_XDECREF(errorHandler);
3229 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003230 Py_DECREF(unicode);
3231 return NULL;
3232}
3233
3234
Alexander Belopolsky40018472011-02-26 01:02:56 +00003235PyObject *
3236PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003237 Py_ssize_t size,
3238 int base64SetO,
3239 int base64WhiteSpace,
3240 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003241{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003242 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003243 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003244 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003245 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003246 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003247 unsigned int base64bits = 0;
3248 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003249 char * out;
3250 char * start;
3251
3252 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003253 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003254
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003255 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003256 return PyErr_NoMemory();
3257
Antoine Pitrou244651a2009-05-04 18:56:13 +00003258 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003259 if (v == NULL)
3260 return NULL;
3261
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003262 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003263 for (;i < size; ++i) {
3264 Py_UNICODE ch = s[i];
3265
Antoine Pitrou244651a2009-05-04 18:56:13 +00003266 if (inShift) {
3267 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3268 /* shifting out */
3269 if (base64bits) { /* output remaining bits */
3270 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3271 base64buffer = 0;
3272 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003273 }
3274 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003275 /* Characters not in the BASE64 set implicitly unshift the sequence
3276 so no '-' is required, except if the character is itself a '-' */
3277 if (IS_BASE64(ch) || ch == '-') {
3278 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003279 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003280 *out++ = (char) ch;
3281 }
3282 else {
3283 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003284 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003285 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003286 else { /* not in a shift sequence */
3287 if (ch == '+') {
3288 *out++ = '+';
3289 *out++ = '-';
3290 }
3291 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3292 *out++ = (char) ch;
3293 }
3294 else {
3295 *out++ = '+';
3296 inShift = 1;
3297 goto encode_char;
3298 }
3299 }
3300 continue;
3301encode_char:
3302#ifdef Py_UNICODE_WIDE
3303 if (ch >= 0x10000) {
3304 /* code first surrogate */
3305 base64bits += 16;
3306 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3307 while (base64bits >= 6) {
3308 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3309 base64bits -= 6;
3310 }
3311 /* prepare second surrogate */
3312 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3313 }
3314#endif
3315 base64bits += 16;
3316 base64buffer = (base64buffer << 16) | ch;
3317 while (base64bits >= 6) {
3318 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3319 base64bits -= 6;
3320 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003321 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003322 if (base64bits)
3323 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3324 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003325 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003326 if (_PyBytes_Resize(&v, out - start) < 0)
3327 return NULL;
3328 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003329}
3330
Antoine Pitrou244651a2009-05-04 18:56:13 +00003331#undef IS_BASE64
3332#undef FROM_BASE64
3333#undef TO_BASE64
3334#undef DECODE_DIRECT
3335#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003336
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337/* --- UTF-8 Codec -------------------------------------------------------- */
3338
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry
   is the total length in bytes of the sequence that byte introduces.
   An entry of zero marks a byte that cannot start a sequence.
   See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 are illegal, C2-CF start 2-byte sequences */
    /* C0-CF */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4 start 4-byte sequences, F5-FF are illegal */
    /* F0-FF */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
3360
Alexander Belopolsky40018472011-02-26 01:02:56 +00003361PyObject *
3362PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003363 Py_ssize_t size,
3364 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365{
Walter Dörwald69652032004-09-07 20:24:22 +00003366 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3367}
3368
Antoine Pitrouab868312009-01-10 15:40:25 +00003369/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3370#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3371
3372/* Mask to quickly check whether a C 'long' contains a
3373 non-ASCII, UTF8-encoded char. */
3374#if (SIZEOF_LONG == 8)
3375# define ASCII_CHAR_MASK 0x8080808080808080L
3376#elif (SIZEOF_LONG == 4)
3377# define ASCII_CHAR_MASK 0x80808080L
3378#else
3379# error C 'long' size should be either 4 or 8!
3380#endif
3381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003382/* Scans a UTF-8 string and returns the maximum character to be expected,
3383 the size of the decoded unicode string and if any major errors were
3384 encountered.
3385
3386 This function does check basic UTF-8 sanity, it does however NOT CHECK
3387 if the string contains surrogates, and if all continuation bytes are
3388 within the correct ranges, these checks are performed in
3389 PyUnicode_DecodeUTF8Stateful.
3390
3391 If it sets has_errors to 1, it means the value of unicode_size and max_char
3392 will be bogus and you should not rely on useful information in them.
3393 */
3394static Py_UCS4
3395utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3396 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3397 int *has_errors)
3398{
3399 Py_ssize_t n;
3400 Py_ssize_t char_count = 0;
3401 Py_UCS4 max_char = 127, new_max;
3402 Py_UCS4 upper_bound;
3403 const unsigned char *p = (const unsigned char *)s;
3404 const unsigned char *end = p + string_size;
3405 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3406 int err = 0;
3407
3408 for (; p < end && !err; ++p, ++char_count) {
3409 /* Only check value if it's not a ASCII char... */
3410 if (*p < 0x80) {
3411 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3412 an explanation. */
3413 if (!((size_t) p & LONG_PTR_MASK)) {
3414 /* Help register allocation */
3415 register const unsigned char *_p = p;
3416 while (_p < aligned_end) {
3417 unsigned long value = *(unsigned long *) _p;
3418 if (value & ASCII_CHAR_MASK)
3419 break;
3420 _p += SIZEOF_LONG;
3421 char_count += SIZEOF_LONG;
3422 }
3423 p = _p;
3424 if (p == end)
3425 break;
3426 }
3427 }
3428 if (*p >= 0x80) {
3429 n = utf8_code_length[*p];
3430 new_max = max_char;
3431 switch (n) {
3432 /* invalid start byte */
3433 case 0:
3434 err = 1;
3435 break;
3436 case 2:
3437 /* Code points between 0x00FF and 0x07FF inclusive.
3438 Approximate the upper bound of the code point,
3439 if this flips over 255 we can be sure it will be more
3440 than 255 and the string will need 2 bytes per code coint,
3441 if it stays under or equal to 255, we can be sure 1 byte
3442 is enough.
3443 ((*p & 0b00011111) << 6) | 0b00111111 */
3444 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3445 if (max_char < upper_bound)
3446 new_max = upper_bound;
3447 /* Ensure we track at least that we left ASCII space. */
3448 if (new_max < 128)
3449 new_max = 128;
3450 break;
3451 case 3:
3452 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3453 always > 255 and <= 65535 and will always need 2 bytes. */
3454 if (max_char < 65535)
3455 new_max = 65535;
3456 break;
3457 case 4:
3458 /* Code point will be above 0xFFFF for sure in this case. */
3459 new_max = 65537;
3460 break;
3461 /* Internal error, this should be caught by the first if */
3462 case 1:
3463 default:
3464 assert(0 && "Impossible case in utf8_max_char_and_size");
3465 err = 1;
3466 }
3467 /* Instead of number of overall bytes for this code point,
3468 n containts the number of following bytes: */
3469 --n;
3470 /* Check if the follow up chars are all valid continuation bytes */
3471 if (n >= 1) {
3472 const unsigned char *cont;
3473 if ((p + n) >= end) {
3474 if (consumed == 0)
3475 /* incomplete data, non-incremental decoding */
3476 err = 1;
3477 break;
3478 }
3479 for (cont = p + 1; cont < (p + n); ++cont) {
3480 if ((*cont & 0xc0) != 0x80) {
3481 err = 1;
3482 break;
3483 }
3484 }
3485 p += n;
3486 }
3487 else
3488 err = 1;
3489 max_char = new_max;
3490 }
3491 }
3492
3493 if (unicode_size)
3494 *unicode_size = char_count;
3495 if (has_errors)
3496 *has_errors = err;
3497 return max_char;
3498}
3499
/* Store value at position index of buf, where kind selects the element
   type of buf.  Like PyUnicode_WRITE, but it additionally understands
   PyUnicode_WCHAR_KIND, i.e. the Py_UNICODE (wstr) buffer of the legacy
   unicode representation.  Any kind other than the three named below is
   written as Py_UCS4.  Only the selected store is executed, so index and
   value are evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int k_ = (kind);                                          \
        switch (k_) {                                                   \
        case PyUnicode_WCHAR_KIND:                                      \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
            break;                                                      \
        case PyUnicode_1BYTE_KIND:                                      \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break;                                                      \
        case PyUnicode_2BYTE_KIND:                                      \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
            break;                                                      \
        default:                                                        \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
            break;                                                      \
        }                                                               \
    } while (0)
3514
Alexander Belopolsky40018472011-02-26 01:02:56 +00003515PyObject *
3516PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003517 Py_ssize_t size,
3518 const char *errors,
3519 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003520{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003523 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003524 Py_ssize_t startinpos;
3525 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003526 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003528 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 PyObject *errorHandler = NULL;
3530 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003531 Py_UCS4 maxchar = 0;
3532 Py_ssize_t unicode_size;
3533 Py_ssize_t i;
3534 int kind;
3535 void *data;
3536 int has_errors;
3537 Py_UNICODE *error_outptr;
3538#if SIZEOF_WCHAR_T == 2
3539 Py_ssize_t wchar_offset = 0;
3540#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003541
Walter Dörwald69652032004-09-07 20:24:22 +00003542 if (size == 0) {
3543 if (consumed)
3544 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003545 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003547 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3548 consumed, &has_errors);
3549 if (has_errors) {
3550 unicode = _PyUnicode_New(size);
3551 if (!unicode)
3552 return NULL;
3553 kind = PyUnicode_WCHAR_KIND;
3554 data = PyUnicode_AS_UNICODE(unicode);
3555 assert(data != NULL);
3556 }
3557 else {
3558 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3559 if (!unicode)
3560 return NULL;
3561 /* When the string is ASCII only, just use memcpy and return.
3562 unicode_size may be != size if there is an incomplete UTF-8
3563 sequence at the end of the ASCII block. */
3564 if (maxchar < 128 && size == unicode_size) {
3565 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3566 return (PyObject *)unicode;
3567 }
3568 kind = PyUnicode_KIND(unicode);
3569 data = PyUnicode_DATA(unicode);
3570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003572 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003574 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575
3576 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003577 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578
3579 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003580 /* Fast path for runs of ASCII characters. Given that common UTF-8
3581 input will consist of an overwhelming majority of ASCII
3582 characters, we try to optimize for this case by checking
3583 as many characters as a C 'long' can contain.
3584 First, check if we can do an aligned read, as most CPUs have
3585 a penalty for unaligned reads.
3586 */
3587 if (!((size_t) s & LONG_PTR_MASK)) {
3588 /* Help register allocation */
3589 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003590 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003591 while (_s < aligned_end) {
3592 /* Read a whole long at a time (either 4 or 8 bytes),
3593 and do a fast unrolled copy if it only contains ASCII
3594 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003595 unsigned long value = *(unsigned long *) _s;
3596 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003597 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003598 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3599 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3600 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3601 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003602#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003603 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3604 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3605 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3606 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003607#endif
3608 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003609 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003610 }
3611 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003612 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003613 if (s == e)
3614 break;
3615 ch = (unsigned char)*s;
3616 }
3617 }
3618
3619 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003620 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621 s++;
3622 continue;
3623 }
3624
3625 n = utf8_code_length[ch];
3626
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003627 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003628 if (consumed)
3629 break;
3630 else {
3631 errmsg = "unexpected end of data";
3632 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003633 endinpos = startinpos+1;
3634 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3635 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003636 goto utf8Error;
3637 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003638 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639
3640 switch (n) {
3641
3642 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003643 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003644 startinpos = s-starts;
3645 endinpos = startinpos+1;
3646 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647
3648 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003649 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003650 startinpos = s-starts;
3651 endinpos = startinpos+1;
3652 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653
3654 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003655 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003656 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003657 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003658 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003659 goto utf8Error;
3660 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003662 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003663 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 break;
3665
3666 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003667 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3668 will result in surrogates in range d800-dfff. Surrogates are
3669 not valid UTF-8 so they are rejected.
3670 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3671 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003672 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003673 (s[2] & 0xc0) != 0x80 ||
3674 ((unsigned char)s[0] == 0xE0 &&
3675 (unsigned char)s[1] < 0xA0) ||
3676 ((unsigned char)s[0] == 0xED &&
3677 (unsigned char)s[1] > 0x9F)) {
3678 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003679 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003680 endinpos = startinpos + 1;
3681
3682 /* if s[1] first two bits are 1 and 0, then the invalid
3683 continuation byte is s[2], so increment endinpos by 1,
3684 if not, s[1] is invalid and endinpos doesn't need to
3685 be incremented. */
3686 if ((s[1] & 0xC0) == 0x80)
3687 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 goto utf8Error;
3689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003691 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003692 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003693 break;
3694
3695 case 4:
3696 if ((s[1] & 0xc0) != 0x80 ||
3697 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003698 (s[3] & 0xc0) != 0x80 ||
3699 ((unsigned char)s[0] == 0xF0 &&
3700 (unsigned char)s[1] < 0x90) ||
3701 ((unsigned char)s[0] == 0xF4 &&
3702 (unsigned char)s[1] > 0x8F)) {
3703 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003704 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003705 endinpos = startinpos + 1;
3706 if ((s[1] & 0xC0) == 0x80) {
3707 endinpos++;
3708 if ((s[2] & 0xC0) == 0x80)
3709 endinpos++;
3710 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003711 goto utf8Error;
3712 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003713 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003714 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3715 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 /* If the string is flexible or we have native UCS-4, write
3718 directly.. */
3719 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3720 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003722 else {
3723 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003725 /* translate from 10000..10FFFF to 0..FFFF */
3726 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003728 /* high surrogate = top 10 bits added to D800 */
3729 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3730 (Py_UNICODE)(0xD800 + (ch >> 10)));
3731
3732 /* low surrogate = bottom 10 bits added to DC00 */
3733 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3734 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3735 }
3736#if SIZEOF_WCHAR_T == 2
3737 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003738#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 }
3741 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003742 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003743
Benjamin Peterson29060642009-01-31 22:14:21 +00003744 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003745 /* If this is not yet a resizable string, make it one.. */
3746 if (kind != PyUnicode_WCHAR_KIND) {
3747 const Py_UNICODE *u;
3748 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3749 if (!new_unicode)
3750 goto onError;
3751 u = PyUnicode_AsUnicode((PyObject *)unicode);
3752 if (!u)
3753 goto onError;
3754#if SIZEOF_WCHAR_T == 2
3755 i += wchar_offset;
3756#endif
3757 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3758 Py_DECREF(unicode);
3759 unicode = new_unicode;
3760 kind = 0;
3761 data = PyUnicode_AS_UNICODE(new_unicode);
3762 assert(data != NULL);
3763 }
3764 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003765 if (unicode_decode_call_errorhandler(
3766 errors, &errorHandler,
3767 "utf8", errmsg,
3768 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003770 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 /* Update data because unicode_decode_call_errorhandler might have
3772 re-created or resized the unicode object. */
3773 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003774 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 /* Ensure the unicode_size calculation above was correct: */
3777 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3778
Walter Dörwald69652032004-09-07 20:24:22 +00003779 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003780 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003782 /* Adjust length and ready string when it contained errors and
3783 is of the old resizable kind. */
3784 if (kind == PyUnicode_WCHAR_KIND) {
3785 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3786 PyUnicode_READY(unicode) == -1)
3787 goto onError;
3788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003790 Py_XDECREF(errorHandler);
3791 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003792 if (PyUnicode_READY(unicode) == -1) {
3793 Py_DECREF(unicode);
3794 return NULL;
3795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 return (PyObject *)unicode;
3797
Benjamin Peterson29060642009-01-31 22:14:21 +00003798 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003799 Py_XDECREF(errorHandler);
3800 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801 Py_DECREF(unicode);
3802 return NULL;
3803}
3804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003805#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003806
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003807#ifdef __APPLE__
3808
3809/* Simplified UTF-8 decoder using surrogateescape error handler,
3810 used to decode the command line arguments on Mac OS X. */
3811
3812wchar_t*
3813_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3814{
3815 int n;
3816 const char *e;
3817 wchar_t *unicode, *p;
3818
3819 /* Note: size will always be longer than the resulting Unicode
3820 character count */
3821 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3822 PyErr_NoMemory();
3823 return NULL;
3824 }
3825 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3826 if (!unicode)
3827 return NULL;
3828
3829 /* Unpack UTF-8 encoded data */
3830 p = unicode;
3831 e = s + size;
3832 while (s < e) {
3833 Py_UCS4 ch = (unsigned char)*s;
3834
3835 if (ch < 0x80) {
3836 *p++ = (wchar_t)ch;
3837 s++;
3838 continue;
3839 }
3840
3841 n = utf8_code_length[ch];
3842 if (s + n > e) {
3843 goto surrogateescape;
3844 }
3845
3846 switch (n) {
3847 case 0:
3848 case 1:
3849 goto surrogateescape;
3850
3851 case 2:
3852 if ((s[1] & 0xc0) != 0x80)
3853 goto surrogateescape;
3854 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3855 assert ((ch > 0x007F) && (ch <= 0x07FF));
3856 *p++ = (wchar_t)ch;
3857 break;
3858
3859 case 3:
3860 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3861 will result in surrogates in range d800-dfff. Surrogates are
3862 not valid UTF-8 so they are rejected.
3863 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3864 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3865 if ((s[1] & 0xc0) != 0x80 ||
3866 (s[2] & 0xc0) != 0x80 ||
3867 ((unsigned char)s[0] == 0xE0 &&
3868 (unsigned char)s[1] < 0xA0) ||
3869 ((unsigned char)s[0] == 0xED &&
3870 (unsigned char)s[1] > 0x9F)) {
3871
3872 goto surrogateescape;
3873 }
3874 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3875 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003877 break;
3878
3879 case 4:
3880 if ((s[1] & 0xc0) != 0x80 ||
3881 (s[2] & 0xc0) != 0x80 ||
3882 (s[3] & 0xc0) != 0x80 ||
3883 ((unsigned char)s[0] == 0xF0 &&
3884 (unsigned char)s[1] < 0x90) ||
3885 ((unsigned char)s[0] == 0xF4 &&
3886 (unsigned char)s[1] > 0x8F)) {
3887 goto surrogateescape;
3888 }
3889 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3890 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3891 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3892
3893#if SIZEOF_WCHAR_T == 4
3894 *p++ = (wchar_t)ch;
3895#else
3896 /* compute and append the two surrogates: */
3897
3898 /* translate from 10000..10FFFF to 0..FFFF */
3899 ch -= 0x10000;
3900
3901 /* high surrogate = top 10 bits added to D800 */
3902 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3903
3904 /* low surrogate = bottom 10 bits added to DC00 */
3905 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3906#endif
3907 break;
3908 }
3909 s += n;
3910 continue;
3911
3912 surrogateescape:
3913 *p++ = 0xDC00 + ch;
3914 s++;
3915 }
3916 *p = L'\0';
3917 return unicode;
3918}
3919
3920#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922/* Primary internal function which creates utf8 encoded bytes objects.
3923
3924 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003925 and allocate exactly as much space needed at the end. Else allocate the
3926 maximum possible needed (4 result bytes per Unicode character), and return
3927 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003928*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003929PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003930_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931{
Tim Peters602f7402002-04-27 18:03:26 +00003932#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003933
Guido van Rossum98297ee2007-11-06 21:34:58 +00003934 Py_ssize_t i; /* index into s of next input byte */
3935 PyObject *result; /* result string object */
3936 char *p; /* next free byte in output buffer */
3937 Py_ssize_t nallocated; /* number of result bytes allocated */
3938 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003939 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003940 PyObject *errorHandler = NULL;
3941 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942 int kind;
3943 void *data;
3944 Py_ssize_t size;
3945 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3946#if SIZEOF_WCHAR_T == 2
3947 Py_ssize_t wchar_offset = 0;
3948#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003950 if (!PyUnicode_Check(unicode)) {
3951 PyErr_BadArgument();
3952 return NULL;
3953 }
3954
3955 if (PyUnicode_READY(unicode) == -1)
3956 return NULL;
3957
3958 if (_PyUnicode_UTF8(unicode))
3959 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
3960 _PyUnicode_UTF8_LENGTH(unicode));
3961
3962 kind = PyUnicode_KIND(unicode);
3963 data = PyUnicode_DATA(unicode);
3964 size = PyUnicode_GET_LENGTH(unicode);
3965
Tim Peters602f7402002-04-27 18:03:26 +00003966 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967
Tim Peters602f7402002-04-27 18:03:26 +00003968 if (size <= MAX_SHORT_UNICHARS) {
3969 /* Write into the stack buffer; nallocated can't overflow.
3970 * At the end, we'll allocate exactly as much heap space as it
3971 * turns out we need.
3972 */
3973 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003974 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00003975 p = stackbuf;
3976 }
3977 else {
3978 /* Overallocate on the heap, and give the excess back at the end. */
3979 nallocated = size * 4;
3980 if (nallocated / 4 != size) /* overflow! */
3981 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00003982 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003983 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00003984 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003985 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003986 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003987
Tim Peters602f7402002-04-27 18:03:26 +00003988 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003989 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003990
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003991 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00003992 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003994
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00003996 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00003997 *p++ = (char)(0xc0 | (ch >> 6));
3998 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00003999 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000 Py_ssize_t newpos;
4001 PyObject *rep;
4002 Py_ssize_t repsize, k, startpos;
4003 startpos = i-1;
4004#if SIZEOF_WCHAR_T == 2
4005 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004006#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 rep = unicode_encode_call_errorhandler(
4008 errors, &errorHandler, "utf-8", "surrogates not allowed",
4009 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4010 &exc, startpos, startpos+1, &newpos);
4011 if (!rep)
4012 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014 if (PyBytes_Check(rep))
4015 repsize = PyBytes_GET_SIZE(rep);
4016 else
4017 repsize = PyUnicode_GET_SIZE(rep);
4018
4019 if (repsize > 4) {
4020 Py_ssize_t offset;
4021
4022 if (result == NULL)
4023 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004024 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004025 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4028 /* integer overflow */
4029 PyErr_NoMemory();
4030 goto error;
4031 }
4032 nallocated += repsize - 4;
4033 if (result != NULL) {
4034 if (_PyBytes_Resize(&result, nallocated) < 0)
4035 goto error;
4036 } else {
4037 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004038 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039 goto error;
4040 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4041 }
4042 p = PyBytes_AS_STRING(result) + offset;
4043 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 if (PyBytes_Check(rep)) {
4046 char *prep = PyBytes_AS_STRING(rep);
4047 for(k = repsize; k > 0; k--)
4048 *p++ = *prep++;
4049 } else /* rep is unicode */ {
4050 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4051 Py_UNICODE c;
4052
4053 for(k=0; k<repsize; k++) {
4054 c = prep[k];
4055 if (0x80 <= c) {
4056 raise_encode_exception(&exc, "utf-8",
4057 PyUnicode_AS_UNICODE(unicode),
4058 size, i-1, i,
4059 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004060 goto error;
4061 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004063 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004066 } else if (ch < 0x10000) {
4067 *p++ = (char)(0xe0 | (ch >> 12));
4068 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4069 *p++ = (char)(0x80 | (ch & 0x3f));
4070 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004071 /* Encode UCS4 Unicode ordinals */
4072 *p++ = (char)(0xf0 | (ch >> 18));
4073 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4074 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4075 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076#if SIZEOF_WCHAR_T == 2
4077 wchar_offset++;
4078#endif
Tim Peters602f7402002-04-27 18:03:26 +00004079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004081
Guido van Rossum98297ee2007-11-06 21:34:58 +00004082 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004083 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004084 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004085 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004086 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004087 }
4088 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004089 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004090 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004091 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004092 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004094
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004095 Py_XDECREF(errorHandler);
4096 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004097 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004098 error:
4099 Py_XDECREF(errorHandler);
4100 Py_XDECREF(exc);
4101 Py_XDECREF(result);
4102 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004103
Tim Peters602f7402002-04-27 18:03:26 +00004104#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105}
4106
Alexander Belopolsky40018472011-02-26 01:02:56 +00004107PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004108PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4109 Py_ssize_t size,
4110 const char *errors)
4111{
4112 PyObject *v, *unicode;
4113
4114 unicode = PyUnicode_FromUnicode(s, size);
4115 if (unicode == NULL)
4116 return NULL;
4117 v = _PyUnicode_AsUTF8String(unicode, errors);
4118 Py_DECREF(unicode);
4119 return v;
4120}
4121
4122PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004123PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004125 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126}
4127
Walter Dörwald41980ca2007-08-16 21:55:45 +00004128/* --- UTF-32 Codec ------------------------------------------------------- */
4129
4130PyObject *
4131PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 Py_ssize_t size,
4133 const char *errors,
4134 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004135{
4136 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4137}
4138
4139PyObject *
4140PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004141 Py_ssize_t size,
4142 const char *errors,
4143 int *byteorder,
4144 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004145{
4146 const char *starts = s;
4147 Py_ssize_t startinpos;
4148 Py_ssize_t endinpos;
4149 Py_ssize_t outpos;
4150 PyUnicodeObject *unicode;
4151 Py_UNICODE *p;
4152#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004153 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004154 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004155#else
4156 const int pairs = 0;
4157#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004158 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004159 int bo = 0; /* assume native ordering by default */
4160 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004161 /* Offsets from q for retrieving bytes in the right order. */
4162#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4163 int iorder[] = {0, 1, 2, 3};
4164#else
4165 int iorder[] = {3, 2, 1, 0};
4166#endif
4167 PyObject *errorHandler = NULL;
4168 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004169
Walter Dörwald41980ca2007-08-16 21:55:45 +00004170 q = (unsigned char *)s;
4171 e = q + size;
4172
4173 if (byteorder)
4174 bo = *byteorder;
4175
4176 /* Check for BOM marks (U+FEFF) in the input and adjust current
4177 byte order setting accordingly. In native mode, the leading BOM
4178 mark is skipped, in all other modes, it is copied to the output
4179 stream as-is (giving a ZWNBSP character). */
4180 if (bo == 0) {
4181 if (size >= 4) {
4182 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004184#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004185 if (bom == 0x0000FEFF) {
4186 q += 4;
4187 bo = -1;
4188 }
4189 else if (bom == 0xFFFE0000) {
4190 q += 4;
4191 bo = 1;
4192 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004193#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 if (bom == 0x0000FEFF) {
4195 q += 4;
4196 bo = 1;
4197 }
4198 else if (bom == 0xFFFE0000) {
4199 q += 4;
4200 bo = -1;
4201 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004202#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004204 }
4205
4206 if (bo == -1) {
4207 /* force LE */
4208 iorder[0] = 0;
4209 iorder[1] = 1;
4210 iorder[2] = 2;
4211 iorder[3] = 3;
4212 }
4213 else if (bo == 1) {
4214 /* force BE */
4215 iorder[0] = 3;
4216 iorder[1] = 2;
4217 iorder[2] = 1;
4218 iorder[3] = 0;
4219 }
4220
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004221 /* On narrow builds we split characters outside the BMP into two
4222 codepoints => count how much extra space we need. */
4223#ifndef Py_UNICODE_WIDE
4224 for (qq = q; qq < e; qq += 4)
4225 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4226 pairs++;
4227#endif
4228
4229 /* This might be one to much, because of a BOM */
4230 unicode = _PyUnicode_New((size+3)/4+pairs);
4231 if (!unicode)
4232 return NULL;
4233 if (size == 0)
4234 return (PyObject *)unicode;
4235
4236 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004237 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004238
Walter Dörwald41980ca2007-08-16 21:55:45 +00004239 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 Py_UCS4 ch;
4241 /* remaining bytes at the end? (size should be divisible by 4) */
4242 if (e-q<4) {
4243 if (consumed)
4244 break;
4245 errmsg = "truncated data";
4246 startinpos = ((const char *)q)-starts;
4247 endinpos = ((const char *)e)-starts;
4248 goto utf32Error;
4249 /* The remaining input chars are ignored if the callback
4250 chooses to skip the input */
4251 }
4252 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4253 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004254
Benjamin Peterson29060642009-01-31 22:14:21 +00004255 if (ch >= 0x110000)
4256 {
4257 errmsg = "codepoint not in range(0x110000)";
4258 startinpos = ((const char *)q)-starts;
4259 endinpos = startinpos+4;
4260 goto utf32Error;
4261 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004262#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004263 if (ch >= 0x10000)
4264 {
4265 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4266 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4267 }
4268 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004269#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004270 *p++ = ch;
4271 q += 4;
4272 continue;
4273 utf32Error:
4274 outpos = p-PyUnicode_AS_UNICODE(unicode);
4275 if (unicode_decode_call_errorhandler(
4276 errors, &errorHandler,
4277 "utf32", errmsg,
4278 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4279 &unicode, &outpos, &p))
4280 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004281 }
4282
4283 if (byteorder)
4284 *byteorder = bo;
4285
4286 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004287 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004288
4289 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004290 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004291 goto onError;
4292
4293 Py_XDECREF(errorHandler);
4294 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004295 if (PyUnicode_READY(unicode) == -1) {
4296 Py_DECREF(unicode);
4297 return NULL;
4298 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004299 return (PyObject *)unicode;
4300
Benjamin Peterson29060642009-01-31 22:14:21 +00004301 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004302 Py_DECREF(unicode);
4303 Py_XDECREF(errorHandler);
4304 Py_XDECREF(exc);
4305 return NULL;
4306}
4307
4308PyObject *
4309PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004310 Py_ssize_t size,
4311 const char *errors,
4312 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004313{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004314 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004315 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004316 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004317#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004318 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004319#else
4320 const int pairs = 0;
4321#endif
4322 /* Offsets from p for storing byte pairs in the right order. */
4323#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4324 int iorder[] = {0, 1, 2, 3};
4325#else
4326 int iorder[] = {3, 2, 1, 0};
4327#endif
4328
Benjamin Peterson29060642009-01-31 22:14:21 +00004329#define STORECHAR(CH) \
4330 do { \
4331 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4332 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4333 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4334 p[iorder[0]] = (CH) & 0xff; \
4335 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004336 } while(0)
4337
4338 /* In narrow builds we can output surrogate pairs as one codepoint,
4339 so we need less space. */
4340#ifndef Py_UNICODE_WIDE
4341 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004342 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4343 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4344 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004345#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004346 nsize = (size - pairs + (byteorder == 0));
4347 bytesize = nsize * 4;
4348 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004349 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004350 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004351 if (v == NULL)
4352 return NULL;
4353
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004354 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004355 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004356 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004357 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004358 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004359
4360 if (byteorder == -1) {
4361 /* force LE */
4362 iorder[0] = 0;
4363 iorder[1] = 1;
4364 iorder[2] = 2;
4365 iorder[3] = 3;
4366 }
4367 else if (byteorder == 1) {
4368 /* force BE */
4369 iorder[0] = 3;
4370 iorder[1] = 2;
4371 iorder[2] = 1;
4372 iorder[3] = 0;
4373 }
4374
4375 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004377#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004378 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4379 Py_UCS4 ch2 = *s;
4380 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4381 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4382 s++;
4383 size--;
4384 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004385 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004386#endif
4387 STORECHAR(ch);
4388 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004389
4390 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004391 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004392#undef STORECHAR
4393}
4394
Alexander Belopolsky40018472011-02-26 01:02:56 +00004395PyObject *
4396PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004397{
4398 if (!PyUnicode_Check(unicode)) {
4399 PyErr_BadArgument();
4400 return NULL;
4401 }
4402 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004403 PyUnicode_GET_SIZE(unicode),
4404 NULL,
4405 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004406}
4407
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408/* --- UTF-16 Codec ------------------------------------------------------- */
4409
Tim Peters772747b2001-08-09 22:21:55 +00004410PyObject *
4411PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 Py_ssize_t size,
4413 const char *errors,
4414 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415{
Walter Dörwald69652032004-09-07 20:24:22 +00004416 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4417}
4418
Antoine Pitrouab868312009-01-10 15:40:25 +00004419/* Two masks for fast checking of whether a C 'long' may contain
4420 UTF16-encoded surrogate characters. This is an efficient heuristic,
4421 assuming that non-surrogate characters with a code point >= 0x8000 are
4422 rare in most input.
4423 FAST_CHAR_MASK is used when the input is in native byte ordering,
4424 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004425*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004426#if (SIZEOF_LONG == 8)
4427# define FAST_CHAR_MASK 0x8000800080008000L
4428# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4429#elif (SIZEOF_LONG == 4)
4430# define FAST_CHAR_MASK 0x80008000L
4431# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4432#else
4433# error C 'long' size should be either 4 or 8!
4434#endif
4435
Walter Dörwald69652032004-09-07 20:24:22 +00004436PyObject *
4437PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 Py_ssize_t size,
4439 const char *errors,
4440 int *byteorder,
4441 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004442{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004444 Py_ssize_t startinpos;
4445 Py_ssize_t endinpos;
4446 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447 PyUnicodeObject *unicode;
4448 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004449 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004450 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004451 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004452 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004453 /* Offsets from q for retrieving byte pairs in the right order. */
4454#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4455 int ihi = 1, ilo = 0;
4456#else
4457 int ihi = 0, ilo = 1;
4458#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 PyObject *errorHandler = NULL;
4460 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461
4462 /* Note: size will always be longer than the resulting Unicode
4463 character count */
4464 unicode = _PyUnicode_New(size);
4465 if (!unicode)
4466 return NULL;
4467 if (size == 0)
4468 return (PyObject *)unicode;
4469
4470 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004471 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004472 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004473 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474
4475 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004476 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004478 /* Check for BOM marks (U+FEFF) in the input and adjust current
4479 byte order setting accordingly. In native mode, the leading BOM
4480 mark is skipped, in all other modes, it is copied to the output
4481 stream as-is (giving a ZWNBSP character). */
4482 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004483 if (size >= 2) {
4484 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004485#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 if (bom == 0xFEFF) {
4487 q += 2;
4488 bo = -1;
4489 }
4490 else if (bom == 0xFFFE) {
4491 q += 2;
4492 bo = 1;
4493 }
Tim Petersced69f82003-09-16 20:30:58 +00004494#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004495 if (bom == 0xFEFF) {
4496 q += 2;
4497 bo = 1;
4498 }
4499 else if (bom == 0xFFFE) {
4500 q += 2;
4501 bo = -1;
4502 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004503#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004505 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506
Tim Peters772747b2001-08-09 22:21:55 +00004507 if (bo == -1) {
4508 /* force LE */
4509 ihi = 1;
4510 ilo = 0;
4511 }
4512 else if (bo == 1) {
4513 /* force BE */
4514 ihi = 0;
4515 ilo = 1;
4516 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004517#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4518 native_ordering = ilo < ihi;
4519#else
4520 native_ordering = ilo > ihi;
4521#endif
Tim Peters772747b2001-08-09 22:21:55 +00004522
Antoine Pitrouab868312009-01-10 15:40:25 +00004523 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004524 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004525 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004526 /* First check for possible aligned read of a C 'long'. Unaligned
4527 reads are more expensive, better to defer to another iteration. */
4528 if (!((size_t) q & LONG_PTR_MASK)) {
4529 /* Fast path for runs of non-surrogate chars. */
4530 register const unsigned char *_q = q;
4531 Py_UNICODE *_p = p;
4532 if (native_ordering) {
4533 /* Native ordering is simple: as long as the input cannot
4534 possibly contain a surrogate char, do an unrolled copy
4535 of several 16-bit code points to the target object.
4536 The non-surrogate check is done on several input bytes
4537 at a time (as many as a C 'long' can contain). */
4538 while (_q < aligned_end) {
4539 unsigned long data = * (unsigned long *) _q;
4540 if (data & FAST_CHAR_MASK)
4541 break;
4542 _p[0] = ((unsigned short *) _q)[0];
4543 _p[1] = ((unsigned short *) _q)[1];
4544#if (SIZEOF_LONG == 8)
4545 _p[2] = ((unsigned short *) _q)[2];
4546 _p[3] = ((unsigned short *) _q)[3];
4547#endif
4548 _q += SIZEOF_LONG;
4549 _p += SIZEOF_LONG / 2;
4550 }
4551 }
4552 else {
4553 /* Byteswapped ordering is similar, but we must decompose
4554 the copy bytewise, and take care of zero'ing out the
4555 upper bytes if the target object is in 32-bit units
4556 (that is, in UCS-4 builds). */
4557 while (_q < aligned_end) {
4558 unsigned long data = * (unsigned long *) _q;
4559 if (data & SWAPPED_FAST_CHAR_MASK)
4560 break;
4561 /* Zero upper bytes in UCS-4 builds */
4562#if (Py_UNICODE_SIZE > 2)
4563 _p[0] = 0;
4564 _p[1] = 0;
4565#if (SIZEOF_LONG == 8)
4566 _p[2] = 0;
4567 _p[3] = 0;
4568#endif
4569#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004570 /* Issue #4916; UCS-4 builds on big endian machines must
4571 fill the two last bytes of each 4-byte unit. */
4572#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4573# define OFF 2
4574#else
4575# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004576#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004577 ((unsigned char *) _p)[OFF + 1] = _q[0];
4578 ((unsigned char *) _p)[OFF + 0] = _q[1];
4579 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4580 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4581#if (SIZEOF_LONG == 8)
4582 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4583 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4584 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4585 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4586#endif
4587#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004588 _q += SIZEOF_LONG;
4589 _p += SIZEOF_LONG / 2;
4590 }
4591 }
4592 p = _p;
4593 q = _q;
4594 if (q >= e)
4595 break;
4596 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004597 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598
Benjamin Peterson14339b62009-01-31 16:36:08 +00004599 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004600
4601 if (ch < 0xD800 || ch > 0xDFFF) {
4602 *p++ = ch;
4603 continue;
4604 }
4605
4606 /* UTF-16 code pair: */
4607 if (q > e) {
4608 errmsg = "unexpected end of data";
4609 startinpos = (((const char *)q) - 2) - starts;
4610 endinpos = ((const char *)e) + 1 - starts;
4611 goto utf16Error;
4612 }
4613 if (0xD800 <= ch && ch <= 0xDBFF) {
4614 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4615 q += 2;
4616 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004617#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 *p++ = ch;
4619 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004620#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004621 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004622#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004623 continue;
4624 }
4625 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004626 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 startinpos = (((const char *)q)-4)-starts;
4628 endinpos = startinpos+2;
4629 goto utf16Error;
4630 }
4631
Benjamin Peterson14339b62009-01-31 16:36:08 +00004632 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004633 errmsg = "illegal encoding";
4634 startinpos = (((const char *)q)-2)-starts;
4635 endinpos = startinpos+2;
4636 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004637
Benjamin Peterson29060642009-01-31 22:14:21 +00004638 utf16Error:
4639 outpos = p - PyUnicode_AS_UNICODE(unicode);
4640 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004641 errors,
4642 &errorHandler,
4643 "utf16", errmsg,
4644 &starts,
4645 (const char **)&e,
4646 &startinpos,
4647 &endinpos,
4648 &exc,
4649 (const char **)&q,
4650 &unicode,
4651 &outpos,
4652 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004653 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004654 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004655 /* remaining byte at the end? (size should be even) */
4656 if (e == q) {
4657 if (!consumed) {
4658 errmsg = "truncated data";
4659 startinpos = ((const char *)q) - starts;
4660 endinpos = ((const char *)e) + 1 - starts;
4661 outpos = p - PyUnicode_AS_UNICODE(unicode);
4662 if (unicode_decode_call_errorhandler(
4663 errors,
4664 &errorHandler,
4665 "utf16", errmsg,
4666 &starts,
4667 (const char **)&e,
4668 &startinpos,
4669 &endinpos,
4670 &exc,
4671 (const char **)&q,
4672 &unicode,
4673 &outpos,
4674 &p))
4675 goto onError;
4676 /* The remaining input chars are ignored if the callback
4677 chooses to skip the input */
4678 }
4679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680
4681 if (byteorder)
4682 *byteorder = bo;
4683
Walter Dörwald69652032004-09-07 20:24:22 +00004684 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004685 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004686
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004688 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 goto onError;
4690
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 Py_XDECREF(errorHandler);
4692 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004693 if (PyUnicode_READY(unicode) == -1) {
4694 Py_DECREF(unicode);
4695 return NULL;
4696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697 return (PyObject *)unicode;
4698
Benjamin Peterson29060642009-01-31 22:14:21 +00004699 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004701 Py_XDECREF(errorHandler);
4702 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703 return NULL;
4704}
4705
Antoine Pitrouab868312009-01-10 15:40:25 +00004706#undef FAST_CHAR_MASK
4707#undef SWAPPED_FAST_CHAR_MASK
4708
Tim Peters772747b2001-08-09 22:21:55 +00004709PyObject *
4710PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004711 Py_ssize_t size,
4712 const char *errors,
4713 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004715 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004716 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004717 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004718#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004719 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004720#else
4721 const int pairs = 0;
4722#endif
Tim Peters772747b2001-08-09 22:21:55 +00004723 /* Offsets from p for storing byte pairs in the right order. */
4724#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4725 int ihi = 1, ilo = 0;
4726#else
4727 int ihi = 0, ilo = 1;
4728#endif
4729
Benjamin Peterson29060642009-01-31 22:14:21 +00004730#define STORECHAR(CH) \
4731 do { \
4732 p[ihi] = ((CH) >> 8) & 0xff; \
4733 p[ilo] = (CH) & 0xff; \
4734 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004735 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004737#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004738 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004739 if (s[i] >= 0x10000)
4740 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004741#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004742 /* 2 * (size + pairs + (byteorder == 0)) */
4743 if (size > PY_SSIZE_T_MAX ||
4744 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004745 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004746 nsize = size + pairs + (byteorder == 0);
4747 bytesize = nsize * 2;
4748 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004749 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004750 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751 if (v == NULL)
4752 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004754 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004757 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004758 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004759
4760 if (byteorder == -1) {
4761 /* force LE */
4762 ihi = 1;
4763 ilo = 0;
4764 }
4765 else if (byteorder == 1) {
4766 /* force BE */
4767 ihi = 0;
4768 ilo = 1;
4769 }
4770
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004771 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004772 Py_UNICODE ch = *s++;
4773 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004774#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004775 if (ch >= 0x10000) {
4776 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4777 ch = 0xD800 | ((ch-0x10000) >> 10);
4778 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004779#endif
Tim Peters772747b2001-08-09 22:21:55 +00004780 STORECHAR(ch);
4781 if (ch2)
4782 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004783 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004784
4785 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004786 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004787#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788}
4789
Alexander Belopolsky40018472011-02-26 01:02:56 +00004790PyObject *
4791PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792{
4793 if (!PyUnicode_Check(unicode)) {
4794 PyErr_BadArgument();
4795 return NULL;
4796 }
4797 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 PyUnicode_GET_SIZE(unicode),
4799 NULL,
4800 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801}
4802
4803/* --- Unicode Escape Codec ----------------------------------------------- */
4804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004805/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4806 if all the escapes in the string make it still a valid ASCII string.
4807 Returns -1 if any escapes were found which cause the string to
4808 pop out of ASCII range. Otherwise returns the length of the
4809 required buffer to hold the string.
4810 */
4811Py_ssize_t
4812length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4813{
4814 const unsigned char *p = (const unsigned char *)s;
4815 const unsigned char *end = p + size;
4816 Py_ssize_t length = 0;
4817
4818 if (size < 0)
4819 return -1;
4820
4821 for (; p < end; ++p) {
4822 if (*p > 127) {
4823 /* Non-ASCII */
4824 return -1;
4825 }
4826 else if (*p != '\\') {
4827 /* Normal character */
4828 ++length;
4829 }
4830 else {
4831 /* Backslash-escape, check next char */
4832 ++p;
4833 /* Escape sequence reaches till end of string or
4834 non-ASCII follow-up. */
4835 if (p >= end || *p > 127)
4836 return -1;
4837 switch (*p) {
4838 case '\n':
4839 /* backslash + \n result in zero characters */
4840 break;
4841 case '\\': case '\'': case '\"':
4842 case 'b': case 'f': case 't':
4843 case 'n': case 'r': case 'v': case 'a':
4844 ++length;
4845 break;
4846 case '0': case '1': case '2': case '3':
4847 case '4': case '5': case '6': case '7':
4848 case 'x': case 'u': case 'U': case 'N':
4849 /* these do not guarantee ASCII characters */
4850 return -1;
4851 default:
4852 /* count the backslash + the other character */
4853 length += 2;
4854 }
4855 }
4856 }
4857 return length;
4858}
4859
/* Similar to PyUnicode_WRITE: store VALUE at position INDEX of BUF.
   When KIND is PyUnicode_WCHAR_KIND, BUF is accessed as an array of
   Py_UNICODE; for every other kind it is accessed as an array of bytes
   and VALUE is converted to unsigned char. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) == PyUnicode_WCHAR_KIND)                             \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        else                                                            \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
    } while (0)

/* Store VALUE at position INDEX of the Py_UNICODE array BUF.  Expands to
   a comma expression that first asserts that a variable named `kind`,
   visible at the point of use, equals PyUnicode_WCHAR_KIND. */
#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
4873
4874
Fredrik Lundh06d12682001-01-24 07:59:11 +00004875static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004876
Alexander Belopolsky40018472011-02-26 01:02:56 +00004877PyObject *
4878PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004879 Py_ssize_t size,
4880 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004882 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004883 Py_ssize_t startinpos;
4884 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004885 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004889 char* message;
4890 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891 PyObject *errorHandler = NULL;
4892 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893 Py_ssize_t ascii_length;
4894 Py_ssize_t i;
4895 int kind;
4896 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004898 ascii_length = length_of_escaped_ascii_string(s, size);
4899
4900 /* After length_of_escaped_ascii_string() there are two alternatives,
4901 either the string is pure ASCII with named escapes like \n, etc.
4902 and we determined it's exact size (common case)
4903 or it contains \x, \u, ... escape sequences. then we create a
4904 legacy wchar string and resize it at the end of this function. */
4905 if (ascii_length >= 0) {
4906 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4907 if (!v)
4908 goto onError;
4909 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4910 kind = PyUnicode_1BYTE_KIND;
4911 data = PyUnicode_DATA(v);
4912 }
4913 else {
4914 /* Escaped strings will always be longer than the resulting
4915 Unicode string, so we start with size here and then reduce the
4916 length after conversion to the true value.
4917 (but if the error callback returns a long replacement string
4918 we'll have to allocate more space) */
4919 v = _PyUnicode_New(size);
4920 if (!v)
4921 goto onError;
4922 kind = PyUnicode_WCHAR_KIND;
4923 data = PyUnicode_AS_UNICODE(v);
4924 }
4925
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926 if (size == 0)
4927 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004928 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004930
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 while (s < end) {
4932 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004933 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004934 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004936 if (kind == PyUnicode_WCHAR_KIND) {
4937 assert(i < _PyUnicode_WSTR_LENGTH(v));
4938 }
4939 else {
4940 /* The only case in which i == ascii_length is a backslash
4941 followed by a newline. */
4942 assert(i <= ascii_length);
4943 }
4944
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 /* Non-escape characters are interpreted as Unicode ordinals */
4946 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004947 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948 continue;
4949 }
4950
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004951 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 /* \ - Escapes */
4953 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004954 c = *s++;
4955 if (s > end)
4956 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004957
4958 if (kind == PyUnicode_WCHAR_KIND) {
4959 assert(i < _PyUnicode_WSTR_LENGTH(v));
4960 }
4961 else {
4962 /* The only case in which i == ascii_length is a backslash
4963 followed by a newline. */
4964 assert(i < ascii_length || (i == ascii_length && c == '\n'));
4965 }
4966
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004967 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004971 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
4972 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
4973 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
4974 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
4975 /* FF */
4976 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
4977 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
4978 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
4979 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
4980 /* VT */
4981 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
4982 /* BEL, not classic C */
4983 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984
Benjamin Peterson29060642009-01-31 22:14:21 +00004985 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 case '0': case '1': case '2': case '3':
4987 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004988 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004989 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004990 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004991 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004992 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004994 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995 break;
4996
Benjamin Peterson29060642009-01-31 22:14:21 +00004997 /* hex escapes */
4998 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005000 digits = 2;
5001 message = "truncated \\xXX escape";
5002 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005006 digits = 4;
5007 message = "truncated \\uXXXX escape";
5008 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005011 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005012 digits = 8;
5013 message = "truncated \\UXXXXXXXX escape";
5014 hexescape:
5015 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005016 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005017 if (s+digits>end) {
5018 endinpos = size;
5019 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 errors, &errorHandler,
5021 "unicodeescape", "end of string in escape sequence",
5022 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005023 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005024 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005025 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005026 goto nextByte;
5027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005028 for (j = 0; j < digits; ++j) {
5029 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005030 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005031 endinpos = (s+j+1)-starts;
5032 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005033 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005034 errors, &errorHandler,
5035 "unicodeescape", message,
5036 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005037 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005038 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005039 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005041 }
5042 chr = (chr<<4) & ~0xF;
5043 if (c >= '0' && c <= '9')
5044 chr += c - '0';
5045 else if (c >= 'a' && c <= 'f')
5046 chr += 10 + c - 'a';
5047 else
5048 chr += 10 + c - 'A';
5049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005050 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005051 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005052 /* _decoding_error will have already written into the
5053 target buffer. */
5054 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005055 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005056 /* when we get here, chr is a 32-bit unicode character */
5057 if (chr <= 0xffff)
5058 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005059 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005060 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005061 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005062 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005063#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005064 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005065#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005066 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005067 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5068 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005069#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005070 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005071 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005072 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005073 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 errors, &errorHandler,
5075 "unicodeescape", "illegal Unicode character",
5076 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005077 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005078 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005079 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005080 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005081 break;
5082
Benjamin Peterson29060642009-01-31 22:14:21 +00005083 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005084 case 'N':
5085 message = "malformed \\N character escape";
5086 if (ucnhash_CAPI == NULL) {
5087 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005088 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5089 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005090 if (ucnhash_CAPI == NULL)
5091 goto ucnhashError;
5092 }
5093 if (*s == '{') {
5094 const char *start = s+1;
5095 /* look for the closing brace */
5096 while (*s != '}' && s < end)
5097 s++;
5098 if (s > start && s < end && *s == '}') {
5099 /* found a name. look it up in the unicode database */
5100 message = "unknown Unicode character name";
5101 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005102 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5103 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005104 goto store;
5105 }
5106 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005107 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005108 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005109 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 errors, &errorHandler,
5111 "unicodeescape", message,
5112 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005113 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005114 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005115 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005116 break;
5117
5118 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005119 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005120 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005121 message = "\\ at end of string";
5122 s--;
5123 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005124 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005126 errors, &errorHandler,
5127 "unicodeescape", message,
5128 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005129 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005130 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005131 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005132 }
5133 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005134 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5135 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005136 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005137 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005139 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005142 /* Ensure the length prediction worked in case of ASCII strings */
5143 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5144
5145 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5146 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005147 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005148 Py_XDECREF(errorHandler);
5149 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005151
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005153 PyErr_SetString(
5154 PyExc_UnicodeError,
5155 "\\N escapes not supported (can't load unicodedata module)"
5156 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005157 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005158 Py_XDECREF(errorHandler);
5159 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005160 return NULL;
5161
Benjamin Peterson29060642009-01-31 22:14:21 +00005162 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005164 Py_XDECREF(errorHandler);
5165 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 return NULL;
5167}
5168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005169#undef WRITE_ASCII_OR_WSTR
5170#undef WRITE_WSTR
5171
/* Return a bytes object holding the Unicode-Escape encoded version of
   the given Py_UNICODE buffer.

   No surrounding quotes are added to the result.

*/
5178
Walter Dörwald79e913e2007-05-12 11:08:06 +00005179static const char *hexdigits = "0123456789abcdef";
5180
Alexander Belopolsky40018472011-02-26 01:02:56 +00005181PyObject *
5182PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005183 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005185 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005188#ifdef Py_UNICODE_WIDE
5189 const Py_ssize_t expandsize = 10;
5190#else
5191 const Py_ssize_t expandsize = 6;
5192#endif
5193
Thomas Wouters89f507f2006-12-13 04:49:30 +00005194 /* XXX(nnorwitz): rather than over-allocating, it would be
5195 better to choose a different scheme. Perhaps scan the
5196 first N-chars of the string and allocate based on that size.
5197 */
5198 /* Initial allocation is based on the longest-possible unichr
5199 escape.
5200
5201 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5202 unichr, so in this case it's the longest unichr escape. In
5203 narrow (UTF-16) builds this is five chars per source unichr
5204 since there are two unichrs in the surrogate pair, so in narrow
5205 (UTF-16) builds it's not the longest unichr escape.
5206
5207 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5208 so in the narrow (UTF-16) build case it's the longest unichr
5209 escape.
5210 */
5211
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005212 if (size == 0)
5213 return PyBytes_FromStringAndSize(NULL, 0);
5214
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005215 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005217
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005218 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005219 2
5220 + expandsize*size
5221 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 if (repr == NULL)
5223 return NULL;
5224
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005225 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 while (size-- > 0) {
5228 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005229
Walter Dörwald79e913e2007-05-12 11:08:06 +00005230 /* Escape backslashes */
5231 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 *p++ = '\\';
5233 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005234 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005235 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005236
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005237#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005238 /* Map 21-bit characters to '\U00xxxxxx' */
5239 else if (ch >= 0x10000) {
5240 *p++ = '\\';
5241 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005242 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5243 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5244 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5245 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5246 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5247 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5248 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5249 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005251 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005252#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5254 else if (ch >= 0xD800 && ch < 0xDC00) {
5255 Py_UNICODE ch2;
5256 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005257
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 ch2 = *s++;
5259 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005260 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5262 *p++ = '\\';
5263 *p++ = 'U';
5264 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5265 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5266 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5267 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5268 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5269 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5270 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5271 *p++ = hexdigits[ucs & 0x0000000F];
5272 continue;
5273 }
5274 /* Fall through: isolated surrogates are copied as-is */
5275 s--;
5276 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005277 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005278#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005279
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005281 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 *p++ = '\\';
5283 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005284 *p++ = hexdigits[(ch >> 12) & 0x000F];
5285 *p++ = hexdigits[(ch >> 8) & 0x000F];
5286 *p++ = hexdigits[(ch >> 4) & 0x000F];
5287 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005289
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005290 /* Map special whitespace to '\t', \n', '\r' */
5291 else if (ch == '\t') {
5292 *p++ = '\\';
5293 *p++ = 't';
5294 }
5295 else if (ch == '\n') {
5296 *p++ = '\\';
5297 *p++ = 'n';
5298 }
5299 else if (ch == '\r') {
5300 *p++ = '\\';
5301 *p++ = 'r';
5302 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005303
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005304 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005305 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005307 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005308 *p++ = hexdigits[(ch >> 4) & 0x000F];
5309 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005310 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005311
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312 /* Copy everything else as-is */
5313 else
5314 *p++ = (char) ch;
5315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005317 assert(p - PyBytes_AS_STRING(repr) > 0);
5318 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5319 return NULL;
5320 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321}
5322
Alexander Belopolsky40018472011-02-26 01:02:56 +00005323PyObject *
5324PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005326 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 if (!PyUnicode_Check(unicode)) {
5328 PyErr_BadArgument();
5329 return NULL;
5330 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005331 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5332 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005333 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334}
5335
5336/* --- Raw Unicode Escape Codec ------------------------------------------- */
5337
Alexander Belopolsky40018472011-02-26 01:02:56 +00005338PyObject *
5339PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005340 Py_ssize_t size,
5341 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005343 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005344 Py_ssize_t startinpos;
5345 Py_ssize_t endinpos;
5346 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005348 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 const char *end;
5350 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005351 PyObject *errorHandler = NULL;
5352 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005353
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 /* Escaped strings will always be longer than the resulting
5355 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356 length after conversion to the true value. (But decoding error
5357 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 v = _PyUnicode_New(size);
5359 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364 end = s + size;
5365 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 unsigned char c;
5367 Py_UCS4 x;
5368 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005369 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 /* Non-escape characters are interpreted as Unicode ordinals */
5372 if (*s != '\\') {
5373 *p++ = (unsigned char)*s++;
5374 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005375 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 startinpos = s-starts;
5377
5378 /* \u-escapes are only interpreted iff the number of leading
5379 backslashes if odd */
5380 bs = s;
5381 for (;s < end;) {
5382 if (*s != '\\')
5383 break;
5384 *p++ = (unsigned char)*s++;
5385 }
5386 if (((s - bs) & 1) == 0 ||
5387 s >= end ||
5388 (*s != 'u' && *s != 'U')) {
5389 continue;
5390 }
5391 p--;
5392 count = *s=='u' ? 4 : 8;
5393 s++;
5394
5395 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5396 outpos = p-PyUnicode_AS_UNICODE(v);
5397 for (x = 0, i = 0; i < count; ++i, ++s) {
5398 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005399 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 endinpos = s-starts;
5401 if (unicode_decode_call_errorhandler(
5402 errors, &errorHandler,
5403 "rawunicodeescape", "truncated \\uXXXX",
5404 &starts, &end, &startinpos, &endinpos, &exc, &s,
5405 &v, &outpos, &p))
5406 goto onError;
5407 goto nextByte;
5408 }
5409 x = (x<<4) & ~0xF;
5410 if (c >= '0' && c <= '9')
5411 x += c - '0';
5412 else if (c >= 'a' && c <= 'f')
5413 x += 10 + c - 'a';
5414 else
5415 x += 10 + c - 'A';
5416 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005417 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 /* UCS-2 character */
5419 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005420 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 /* UCS-4 character. Either store directly, or as
5422 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005423#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005425#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005426 x -= 0x10000L;
5427 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5428 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005429#endif
5430 } else {
5431 endinpos = s-starts;
5432 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005433 if (unicode_decode_call_errorhandler(
5434 errors, &errorHandler,
5435 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 &starts, &end, &startinpos, &endinpos, &exc, &s,
5437 &v, &outpos, &p))
5438 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005439 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 nextByte:
5441 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005443 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 Py_XDECREF(errorHandler);
5446 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005447 if (PyUnicode_READY(v) == -1) {
5448 Py_DECREF(v);
5449 return NULL;
5450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005452
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005455 Py_XDECREF(errorHandler);
5456 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 return NULL;
5458}
5459
Alexander Belopolsky40018472011-02-26 01:02:56 +00005460PyObject *
5461PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005462 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005464 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465 char *p;
5466 char *q;
5467
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005468#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005469 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005470#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005471 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005472#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005473
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005474 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005476
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005477 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 if (repr == NULL)
5479 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005480 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005481 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005483 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 while (size-- > 0) {
5485 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005486#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005487 /* Map 32-bit characters to '\Uxxxxxxxx' */
5488 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005489 *p++ = '\\';
5490 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005491 *p++ = hexdigits[(ch >> 28) & 0xf];
5492 *p++ = hexdigits[(ch >> 24) & 0xf];
5493 *p++ = hexdigits[(ch >> 20) & 0xf];
5494 *p++ = hexdigits[(ch >> 16) & 0xf];
5495 *p++ = hexdigits[(ch >> 12) & 0xf];
5496 *p++ = hexdigits[(ch >> 8) & 0xf];
5497 *p++ = hexdigits[(ch >> 4) & 0xf];
5498 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005499 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005500 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005501#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005502 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5503 if (ch >= 0xD800 && ch < 0xDC00) {
5504 Py_UNICODE ch2;
5505 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005506
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 ch2 = *s++;
5508 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005509 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5511 *p++ = '\\';
5512 *p++ = 'U';
5513 *p++ = hexdigits[(ucs >> 28) & 0xf];
5514 *p++ = hexdigits[(ucs >> 24) & 0xf];
5515 *p++ = hexdigits[(ucs >> 20) & 0xf];
5516 *p++ = hexdigits[(ucs >> 16) & 0xf];
5517 *p++ = hexdigits[(ucs >> 12) & 0xf];
5518 *p++ = hexdigits[(ucs >> 8) & 0xf];
5519 *p++ = hexdigits[(ucs >> 4) & 0xf];
5520 *p++ = hexdigits[ucs & 0xf];
5521 continue;
5522 }
5523 /* Fall through: isolated surrogates are copied as-is */
5524 s--;
5525 size++;
5526 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005527#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 /* Map 16-bit characters to '\uxxxx' */
5529 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530 *p++ = '\\';
5531 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005532 *p++ = hexdigits[(ch >> 12) & 0xf];
5533 *p++ = hexdigits[(ch >> 8) & 0xf];
5534 *p++ = hexdigits[(ch >> 4) & 0xf];
5535 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 /* Copy everything else as-is */
5538 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 *p++ = (char) ch;
5540 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005541 size = p - q;
5542
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005543 assert(size > 0);
5544 if (_PyBytes_Resize(&repr, size) < 0)
5545 return NULL;
5546 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547}
5548
Alexander Belopolsky40018472011-02-26 01:02:56 +00005549PyObject *
5550PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005552 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005554 PyErr_BadArgument();
5555 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005557 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5558 PyUnicode_GET_SIZE(unicode));
5559
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005560 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561}
5562
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005563/* --- Unicode Internal Codec ------------------------------------------- */
5564
Alexander Belopolsky40018472011-02-26 01:02:56 +00005565PyObject *
5566_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005567 Py_ssize_t size,
5568 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005569{
5570 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005571 Py_ssize_t startinpos;
5572 Py_ssize_t endinpos;
5573 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005574 PyUnicodeObject *v;
5575 Py_UNICODE *p;
5576 const char *end;
5577 const char *reason;
5578 PyObject *errorHandler = NULL;
5579 PyObject *exc = NULL;
5580
Neal Norwitzd43069c2006-01-08 01:12:10 +00005581#ifdef Py_UNICODE_WIDE
5582 Py_UNICODE unimax = PyUnicode_GetMax();
5583#endif
5584
Thomas Wouters89f507f2006-12-13 04:49:30 +00005585 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005586 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5587 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005589 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5590 as string was created with the old API. */
5591 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005593 p = PyUnicode_AS_UNICODE(v);
5594 end = s + size;
5595
5596 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005597 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005598 /* We have to sanity check the raw data, otherwise doom looms for
5599 some malformed UCS-4 data. */
5600 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005601#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005602 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005603#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005604 end-s < Py_UNICODE_SIZE
5605 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005607 startinpos = s - starts;
5608 if (end-s < Py_UNICODE_SIZE) {
5609 endinpos = end-starts;
5610 reason = "truncated input";
5611 }
5612 else {
5613 endinpos = s - starts + Py_UNICODE_SIZE;
5614 reason = "illegal code point (> 0x10FFFF)";
5615 }
5616 outpos = p - PyUnicode_AS_UNICODE(v);
5617 if (unicode_decode_call_errorhandler(
5618 errors, &errorHandler,
5619 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005620 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005621 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005622 goto onError;
5623 }
5624 }
5625 else {
5626 p++;
5627 s += Py_UNICODE_SIZE;
5628 }
5629 }
5630
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005631 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005632 goto onError;
5633 Py_XDECREF(errorHandler);
5634 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005635 if (PyUnicode_READY(v) == -1) {
5636 Py_DECREF(v);
5637 return NULL;
5638 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005639 return (PyObject *)v;
5640
Benjamin Peterson29060642009-01-31 22:14:21 +00005641 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005642 Py_XDECREF(v);
5643 Py_XDECREF(errorHandler);
5644 Py_XDECREF(exc);
5645 return NULL;
5646}
5647
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648/* --- Latin-1 Codec ------------------------------------------------------ */
5649
Alexander Belopolsky40018472011-02-26 01:02:56 +00005650PyObject *
5651PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005652 Py_ssize_t size,
5653 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005656 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657}
5658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005659/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005660static void
5661make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005662 const char *encoding,
5663 const Py_UNICODE *unicode, Py_ssize_t size,
5664 Py_ssize_t startpos, Py_ssize_t endpos,
5665 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 *exceptionObject = PyUnicodeEncodeError_Create(
5669 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 }
5671 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5673 goto onError;
5674 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5675 goto onError;
5676 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5677 goto onError;
5678 return;
5679 onError:
5680 Py_DECREF(*exceptionObject);
5681 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 }
5683}
5684
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005686static void
5687raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005688 const char *encoding,
5689 const Py_UNICODE *unicode, Py_ssize_t size,
5690 Py_ssize_t startpos, Py_ssize_t endpos,
5691 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692{
5693 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005695 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697}
5698
5699/* error handling callback helper:
5700 build arguments, call the callback and check the arguments,
5701 put the result into newpos and return the replacement string, which
5702 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005703static PyObject *
5704unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005705 PyObject **errorHandler,
5706 const char *encoding, const char *reason,
5707 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5708 Py_ssize_t startpos, Py_ssize_t endpos,
5709 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005711 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712
5713 PyObject *restuple;
5714 PyObject *resunicode;
5715
5716 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 }
5721
5722 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726
5727 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005729 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005732 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 Py_DECREF(restuple);
5734 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005736 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 &resunicode, newpos)) {
5738 Py_DECREF(restuple);
5739 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005741 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5742 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5743 Py_DECREF(restuple);
5744 return NULL;
5745 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005748 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5750 Py_DECREF(restuple);
5751 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005752 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753 Py_INCREF(resunicode);
5754 Py_DECREF(restuple);
5755 return resunicode;
5756}
5757
/* Shared encoder for the one-byte codecs.

   Encodes `size` code units at `p` into a bytes object, one byte per
   code unit, for every code unit below `limit`.  A limit of 256 is
   reported as "latin-1"; any other limit is reported as "ascii" (see
   the `encoding` and `reason` strings below).

   Runs of code units >= limit are handled according to `errors`:
   NULL/"strict" raises, "replace" writes one '?' per code unit,
   "ignore" drops them, "xmlcharrefreplace" writes "&#N;" references,
   and any other name is resolved through
   unicode_encode_call_errorhandler().

   Returns a new bytes object, or NULL on failure. */
static PyObject *
unicode_encode_ucs1(const Py_UNICODE *p,
                    Py_ssize_t size,
                    const char *errors,
                    int limit)
{
    /* output object */
    PyObject *res;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer to the beginning of the unencodable characters */
    /* const Py_UNICODE *badp = NULL; */
    /* pointer into the output (next byte to write) */
    char *str;
    /* number of bytes currently allocated for the output */
    Py_ssize_t ressize;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL)
        return NULL;
    str = PyBytes_AS_STRING(res);
    ressize = size;

    while (p<endp) {
        Py_UNICODE c = *p;

        /* can we encode this? */
        if (c<limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)c;
            ++p;
        }
        else {
            /* input index of the first unencodable code unit of this run */
            Py_ssize_t unicodepos = p-startp;
            Py_ssize_t requiredsize;
            PyObject *repunicode;
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_ssize_t respos;
            Py_UNICODE *uni2;
            /* startpos for collecting unencodable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p;
            /* find all unencodable characters */
            while ((collend < endp) && ((*collend)>=limit))
                ++collend;
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* one '?' per unencodable code unit; this never needs
                   more room than the input it replaces */
                while (collstart++<collend)
                    *str++ = '?'; /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                respos = str - PyBytes_AS_STRING(res);
                /* determine replacement size (temporarily (mis)uses p);
                   each "&#N;" reference takes 2 bytes for "&#", one per
                   decimal digit of N, and 1 for ';' */
                for (p = collstart, repsize = 0; p < collend; ++p) {
                    if (*p<10)
                        repsize += 2+1+1;
                    else if (*p<100)
                        repsize += 2+2+1;
                    else if (*p<1000)
                        repsize += 2+3+1;
                    else if (*p<10000)
                        repsize += 2+4+1;
#ifndef Py_UNICODE_WIDE
                    else
                        repsize += 2+5+1;
#else
                    else if (*p<100000)
                        repsize += 2+5+1;
                    else if (*p<1000000)
                        repsize += 2+6+1;
                    else
                        repsize += 2+7+1;
#endif
                }
                /* room for what was written, the references, and the
                   rest of the input after this run */
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    /* grow at least geometrically */
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize))
                        goto onError;
                    /* the buffer may have moved: recompute the write pointer */
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    str += sprintf(str, "&#%d;", (int)*p);
                }
                p = collend;
                break;
            default:
                /* any other handler name: call the registered callback */
                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                              encoding, reason, startp, size, &exc,
                                                              collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                if (PyBytes_Check(repunicode)) {
                    /* Directly copy bytes result to output. */
                    repsize = PyBytes_Size(repunicode);
                    if (repsize > 1) {
                        /* Make room for all additional bytes. */
                        respos = str - PyBytes_AS_STRING(res);
                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
                            Py_DECREF(repunicode);
                            goto onError;
                        }
                        str = PyBytes_AS_STRING(res) + respos;
                        ressize += repsize-1;
                    }
                    memcpy(str, PyBytes_AsString(repunicode), repsize);
                    str += repsize;
                    /* resume at the input position chosen by the handler */
                    p = startp + newpos;
                    Py_DECREF(repunicode);
                    break;
                }
                /* need more space? (at least enough for what we
                   have+the replacement+the rest of the string, so
                   we won't have to check space for encodable characters) */
                /* NOTE(review): this reservation (and the bytes branch
                   above) counts the remaining input from collend, while
                   encoding resumes at startp + newpos; it looks like a
                   handler returning a position before collend is not
                   accounted for -- confirm. */
                respos = str - PyBytes_AS_STRING(res);
                repsize = PyUnicode_GET_SIZE(repunicode);
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize)) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* check if there is anything unencodable in the replacement
                   and copy it to the output */
                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
                    c = *uni2;
                    if (c >= limit) {
                        /* the replacement itself is unencodable: report
                           the first code unit of the original run */
                        raise_encode_exception(&exc, encoding, startp, size,
                                               unicodepos, unicodepos+1, reason);
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    *str = (char)c;
                }
                /* resume at the input position chosen by the handler */
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much; from here on `size` holds the
       number of bytes written */
    size = str - PyBytes_AS_STRING(res);
    if (size < ressize) { /* If this fails res will be NULL */
        assert(size >= 0);
        if (_PyBytes_Resize(&res, size) < 0)
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5954
Alexander Belopolsky40018472011-02-26 01:02:56 +00005955PyObject *
5956PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005957 Py_ssize_t size,
5958 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005960 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961}
5962
Alexander Belopolsky40018472011-02-26 01:02:56 +00005963PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005964_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965{
5966 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005967 PyErr_BadArgument();
5968 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005970 if (PyUnicode_READY(unicode) == -1)
5971 return NULL;
5972 /* Fast path: if it is a one-byte string, construct
5973 bytes object directly. */
5974 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
5975 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
5976 PyUnicode_GET_LENGTH(unicode));
5977 /* Non-Latin-1 characters present. Defer to above function to
5978 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005981 errors);
5982}
5983
5984PyObject*
5985PyUnicode_AsLatin1String(PyObject *unicode)
5986{
5987 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988}
5989
5990/* --- 7-bit ASCII Codec -------------------------------------------------- */
5991
Alexander Belopolsky40018472011-02-26 01:02:56 +00005992PyObject *
5993PyUnicode_DecodeASCII(const char *s,
5994 Py_ssize_t size,
5995 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005997 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 PyUnicodeObject *v;
5999 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006000 Py_ssize_t startinpos;
6001 Py_ssize_t endinpos;
6002 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006003 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006004 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006005 PyObject *errorHandler = NULL;
6006 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006007 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006008
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006010 if (size == 1 && *(unsigned char*)s < 128)
6011 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6012
6013 /* Fast path. Assume the input actually *is* ASCII, and allocate
6014 a single-block Unicode object with that assumption. If there is
6015 an error, drop the object and start over. */
6016 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6017 if (v == NULL)
6018 goto onError;
6019 d = PyUnicode_1BYTE_DATA(v);
6020 for (i = 0; i < size; i++) {
6021 unsigned char ch = ((unsigned char*)s)[i];
6022 if (ch < 128)
6023 d[i] = ch;
6024 else
6025 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006027 if (i == size)
6028 return (PyObject*)v;
6029 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006030
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 v = _PyUnicode_New(size);
6032 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006037 e = s + size;
6038 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 register unsigned char c = (unsigned char)*s;
6040 if (c < 128) {
6041 *p++ = c;
6042 ++s;
6043 }
6044 else {
6045 startinpos = s-starts;
6046 endinpos = startinpos + 1;
6047 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6048 if (unicode_decode_call_errorhandler(
6049 errors, &errorHandler,
6050 "ascii", "ordinal not in range(128)",
6051 &starts, &e, &startinpos, &endinpos, &exc, &s,
6052 &v, &outpos, &p))
6053 goto onError;
6054 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006056 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059 Py_XDECREF(errorHandler);
6060 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006061 if (PyUnicode_READY(v) == -1) {
6062 Py_DECREF(v);
6063 return NULL;
6064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006066
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006069 Py_XDECREF(errorHandler);
6070 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 return NULL;
6072}
6073
Alexander Belopolsky40018472011-02-26 01:02:56 +00006074PyObject *
6075PyUnicode_EncodeASCII(const Py_UNICODE *p,
6076 Py_ssize_t size,
6077 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080}
6081
Alexander Belopolsky40018472011-02-26 01:02:56 +00006082PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006083_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084{
6085 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 PyErr_BadArgument();
6087 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006089 if (PyUnicode_READY(unicode) == -1)
6090 return NULL;
6091 /* Fast path: if it is an ASCII-only string, construct bytes object
6092 directly. Else defer to above function to raise the exception. */
6093 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6094 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6095 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006098 errors);
6099}
6100
6101PyObject *
6102PyUnicode_AsASCIIString(PyObject *unicode)
6103{
6104 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105}
6106
Victor Stinner99b95382011-07-04 14:23:54 +02006107#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006108
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006109/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006110
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006111#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006112#define NEED_RETRY
6113#endif
6114
6115/* XXX This code is limited to "true" double-byte encodings, as
6116 a) it assumes an incomplete character consists of a single byte, and
6117 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006119
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   The byte must be reported as a lead byte by IsDBCSLeadByte().  It then
   counts as a lead byte when CharPrev() cannot step back (it returns the
   same position), when the preceding character does not itself begin with
   a lead byte, or when the preceding character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
6131
6132/*
6133 * Decode MBCS string into unicode object. If 'final' is set, converts
6134 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6135 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006136static int
6137decode_mbcs(PyUnicodeObject **v,
6138 const char *s, /* MBCS string */
6139 int size, /* sizeof MBCS string */
6140 int final,
6141 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006142{
6143 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006144 Py_ssize_t n;
6145 DWORD usize;
6146 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006147
6148 assert(size >= 0);
6149
Victor Stinner554f3f02010-06-16 23:33:54 +00006150 /* check and handle 'errors' arg */
6151 if (errors==NULL || strcmp(errors, "strict")==0)
6152 flags = MB_ERR_INVALID_CHARS;
6153 else if (strcmp(errors, "ignore")==0)
6154 flags = 0;
6155 else {
6156 PyErr_Format(PyExc_ValueError,
6157 "mbcs encoding does not support errors='%s'",
6158 errors);
6159 return -1;
6160 }
6161
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006162 /* Skip trailing lead-byte unless 'final' is set */
6163 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006165
6166 /* First get the size of the result */
6167 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006168 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6169 if (usize==0)
6170 goto mbcs_decode_error;
6171 } else
6172 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006173
6174 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 /* Create unicode object */
6176 *v = _PyUnicode_New(usize);
6177 if (*v == NULL)
6178 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006179 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006180 }
6181 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 /* Extend unicode object */
6183 n = PyUnicode_GET_SIZE(*v);
6184 if (_PyUnicode_Resize(v, n + usize) < 0)
6185 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006186 }
6187
6188 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006189 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006190 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006191 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6192 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006194 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006195 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006196
6197mbcs_decode_error:
6198 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6199 we raise a UnicodeDecodeError - else it is a 'generic'
6200 windows error
6201 */
6202 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6203 /* Ideally, we should get reason from FormatMessage - this
6204 is the Windows 2000 English version of the message
6205 */
6206 PyObject *exc = NULL;
6207 const char *reason = "No mapping for the Unicode character exists "
6208 "in the target multi-byte code page.";
6209 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6210 if (exc != NULL) {
6211 PyCodec_StrictErrors(exc);
6212 Py_DECREF(exc);
6213 }
6214 } else {
6215 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6216 }
6217 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006218}
6219
Alexander Belopolsky40018472011-02-26 01:02:56 +00006220PyObject *
6221PyUnicode_DecodeMBCSStateful(const char *s,
6222 Py_ssize_t size,
6223 const char *errors,
6224 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006225{
6226 PyUnicodeObject *v = NULL;
6227 int done;
6228
6229 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006231
6232#ifdef NEED_RETRY
6233 retry:
6234 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006235 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006236 else
6237#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006238 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006239
6240 if (done < 0) {
6241 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006243 }
6244
6245 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006247
6248#ifdef NEED_RETRY
6249 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 s += done;
6251 size -= done;
6252 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006253 }
6254#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006255 if (PyUnicode_READY(v) == -1) {
6256 Py_DECREF(v);
6257 return NULL;
6258 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006259 return (PyObject *)v;
6260}
6261
Alexander Belopolsky40018472011-02-26 01:02:56 +00006262PyObject *
6263PyUnicode_DecodeMBCS(const char *s,
6264 Py_ssize_t size,
6265 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006266{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006267 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6268}
6269
6270/*
6271 * Convert unicode into string object (MBCS).
6272 * Returns 0 if succeed, -1 otherwise.
6273 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006274static int
6275encode_mbcs(PyObject **repr,
6276 const Py_UNICODE *p, /* unicode */
6277 int size, /* size of unicode */
6278 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006279{
Victor Stinner554f3f02010-06-16 23:33:54 +00006280 BOOL usedDefaultChar = FALSE;
6281 BOOL *pusedDefaultChar;
6282 int mbcssize;
6283 Py_ssize_t n;
6284 PyObject *exc = NULL;
6285 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006286
6287 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006288
Victor Stinner554f3f02010-06-16 23:33:54 +00006289 /* check and handle 'errors' arg */
6290 if (errors==NULL || strcmp(errors, "strict")==0) {
6291 flags = WC_NO_BEST_FIT_CHARS;
6292 pusedDefaultChar = &usedDefaultChar;
6293 } else if (strcmp(errors, "replace")==0) {
6294 flags = 0;
6295 pusedDefaultChar = NULL;
6296 } else {
6297 PyErr_Format(PyExc_ValueError,
6298 "mbcs encoding does not support errors='%s'",
6299 errors);
6300 return -1;
6301 }
6302
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006303 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006304 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006305 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6306 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 if (mbcssize == 0) {
6308 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6309 return -1;
6310 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006311 /* If we used a default char, then we failed! */
6312 if (pusedDefaultChar && *pusedDefaultChar)
6313 goto mbcs_encode_error;
6314 } else {
6315 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006316 }
6317
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006318 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 /* Create string object */
6320 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6321 if (*repr == NULL)
6322 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006323 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006324 }
6325 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 /* Extend string object */
6327 n = PyBytes_Size(*repr);
6328 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6329 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006330 }
6331
6332 /* Do the conversion */
6333 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006335 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6336 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6338 return -1;
6339 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006340 if (pusedDefaultChar && *pusedDefaultChar)
6341 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006342 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006343 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006344
6345mbcs_encode_error:
6346 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6347 Py_XDECREF(exc);
6348 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006349}
6350
Alexander Belopolsky40018472011-02-26 01:02:56 +00006351PyObject *
6352PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6353 Py_ssize_t size,
6354 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006355{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006356 PyObject *repr = NULL;
6357 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006358
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006359#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006361 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006362 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006363 else
6364#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006365 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006366
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006367 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 Py_XDECREF(repr);
6369 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006370 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006371
6372#ifdef NEED_RETRY
6373 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 p += INT_MAX;
6375 size -= INT_MAX;
6376 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006377 }
6378#endif
6379
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006380 return repr;
6381}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006382
Alexander Belopolsky40018472011-02-26 01:02:56 +00006383PyObject *
6384PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006385{
6386 if (!PyUnicode_Check(unicode)) {
6387 PyErr_BadArgument();
6388 return NULL;
6389 }
6390 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 PyUnicode_GET_SIZE(unicode),
6392 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006393}
6394
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006395#undef NEED_RETRY
6396
Victor Stinner99b95382011-07-04 14:23:54 +02006397#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006398
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399/* --- Character Mapping Codec -------------------------------------------- */
6400
Alexander Belopolsky40018472011-02-26 01:02:56 +00006401PyObject *
6402PyUnicode_DecodeCharmap(const char *s,
6403 Py_ssize_t size,
6404 PyObject *mapping,
6405 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006408 Py_ssize_t startinpos;
6409 Py_ssize_t endinpos;
6410 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 PyUnicodeObject *v;
6413 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006414 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415 PyObject *errorHandler = NULL;
6416 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006417 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006418 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006419
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 /* Default to Latin-1 */
6421 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423
6424 v = _PyUnicode_New(size);
6425 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006431 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 mapstring = PyUnicode_AS_UNICODE(mapping);
6433 maplen = PyUnicode_GET_SIZE(mapping);
6434 while (s < e) {
6435 unsigned char ch = *s;
6436 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 if (ch < maplen)
6439 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 if (x == 0xfffe) {
6442 /* undefined mapping */
6443 outpos = p-PyUnicode_AS_UNICODE(v);
6444 startinpos = s-starts;
6445 endinpos = startinpos+1;
6446 if (unicode_decode_call_errorhandler(
6447 errors, &errorHandler,
6448 "charmap", "character maps to <undefined>",
6449 &starts, &e, &startinpos, &endinpos, &exc, &s,
6450 &v, &outpos, &p)) {
6451 goto onError;
6452 }
6453 continue;
6454 }
6455 *p++ = x;
6456 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006457 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006458 }
6459 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 while (s < e) {
6461 unsigned char ch = *s;
6462 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006463
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6465 w = PyLong_FromLong((long)ch);
6466 if (w == NULL)
6467 goto onError;
6468 x = PyObject_GetItem(mapping, w);
6469 Py_DECREF(w);
6470 if (x == NULL) {
6471 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6472 /* No mapping found means: mapping is undefined. */
6473 PyErr_Clear();
6474 x = Py_None;
6475 Py_INCREF(x);
6476 } else
6477 goto onError;
6478 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006479
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 /* Apply mapping */
6481 if (PyLong_Check(x)) {
6482 long value = PyLong_AS_LONG(x);
6483 if (value < 0 || value > 65535) {
6484 PyErr_SetString(PyExc_TypeError,
6485 "character mapping must be in range(65536)");
6486 Py_DECREF(x);
6487 goto onError;
6488 }
6489 *p++ = (Py_UNICODE)value;
6490 }
6491 else if (x == Py_None) {
6492 /* undefined mapping */
6493 outpos = p-PyUnicode_AS_UNICODE(v);
6494 startinpos = s-starts;
6495 endinpos = startinpos+1;
6496 if (unicode_decode_call_errorhandler(
6497 errors, &errorHandler,
6498 "charmap", "character maps to <undefined>",
6499 &starts, &e, &startinpos, &endinpos, &exc, &s,
6500 &v, &outpos, &p)) {
6501 Py_DECREF(x);
6502 goto onError;
6503 }
6504 Py_DECREF(x);
6505 continue;
6506 }
6507 else if (PyUnicode_Check(x)) {
6508 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006509
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 if (targetsize == 1)
6511 /* 1-1 mapping */
6512 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006513
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 else if (targetsize > 1) {
6515 /* 1-n mapping */
6516 if (targetsize > extrachars) {
6517 /* resize first */
6518 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6519 Py_ssize_t needed = (targetsize - extrachars) + \
6520 (targetsize << 2);
6521 extrachars += needed;
6522 /* XXX overflow detection missing */
6523 if (_PyUnicode_Resize(&v,
6524 PyUnicode_GET_SIZE(v) + needed) < 0) {
6525 Py_DECREF(x);
6526 goto onError;
6527 }
6528 p = PyUnicode_AS_UNICODE(v) + oldpos;
6529 }
6530 Py_UNICODE_COPY(p,
6531 PyUnicode_AS_UNICODE(x),
6532 targetsize);
6533 p += targetsize;
6534 extrachars -= targetsize;
6535 }
6536 /* 1-0 mapping: skip the character */
6537 }
6538 else {
6539 /* wrong return value */
6540 PyErr_SetString(PyExc_TypeError,
6541 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006542 Py_DECREF(x);
6543 goto onError;
6544 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 Py_DECREF(x);
6546 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 }
6549 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6551 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006552 Py_XDECREF(errorHandler);
6553 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006554 if (PyUnicode_READY(v) == -1) {
6555 Py_DECREF(v);
6556 return NULL;
6557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006559
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006561 Py_XDECREF(errorHandler);
6562 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563 Py_XDECREF(v);
6564 return NULL;
6565}
6566
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006567/* Charmap encoding: the lookup table */
6568
Alexander Belopolsky40018472011-02-26 01:02:56 +00006569struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 PyObject_HEAD
6571 unsigned char level1[32];
6572 int count2, count3;
6573 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006574};
6575
6576static PyObject*
6577encoding_map_size(PyObject *obj, PyObject* args)
6578{
6579 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006580 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006582}
6583
6584static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006585 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 PyDoc_STR("Return the size (in bytes) of this object") },
6587 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006588};
6589
6590static void
6591encoding_map_dealloc(PyObject* o)
6592{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006593 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006594}
6595
6596static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006597 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 "EncodingMap", /*tp_name*/
6599 sizeof(struct encoding_map), /*tp_basicsize*/
6600 0, /*tp_itemsize*/
6601 /* methods */
6602 encoding_map_dealloc, /*tp_dealloc*/
6603 0, /*tp_print*/
6604 0, /*tp_getattr*/
6605 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006606 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 0, /*tp_repr*/
6608 0, /*tp_as_number*/
6609 0, /*tp_as_sequence*/
6610 0, /*tp_as_mapping*/
6611 0, /*tp_hash*/
6612 0, /*tp_call*/
6613 0, /*tp_str*/
6614 0, /*tp_getattro*/
6615 0, /*tp_setattro*/
6616 0, /*tp_as_buffer*/
6617 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6618 0, /*tp_doc*/
6619 0, /*tp_traverse*/
6620 0, /*tp_clear*/
6621 0, /*tp_richcompare*/
6622 0, /*tp_weaklistoffset*/
6623 0, /*tp_iter*/
6624 0, /*tp_iternext*/
6625 encoding_map_methods, /*tp_methods*/
6626 0, /*tp_members*/
6627 0, /*tp_getset*/
6628 0, /*tp_base*/
6629 0, /*tp_dict*/
6630 0, /*tp_descr_get*/
6631 0, /*tp_descr_set*/
6632 0, /*tp_dictoffset*/
6633 0, /*tp_init*/
6634 0, /*tp_alloc*/
6635 0, /*tp_new*/
6636 0, /*tp_free*/
6637 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006638};
6639
6640PyObject*
6641PyUnicode_BuildEncodingMap(PyObject* string)
6642{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006643 PyObject *result;
6644 struct encoding_map *mresult;
6645 int i;
6646 int need_dict = 0;
6647 unsigned char level1[32];
6648 unsigned char level2[512];
6649 unsigned char *mlevel1, *mlevel2, *mlevel3;
6650 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006651 int kind;
6652 void *data;
6653 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006655 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006656 PyErr_BadArgument();
6657 return NULL;
6658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006659 kind = PyUnicode_KIND(string);
6660 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006661 memset(level1, 0xFF, sizeof level1);
6662 memset(level2, 0xFF, sizeof level2);
6663
6664 /* If there isn't a one-to-one mapping of NULL to \0,
6665 or if there are non-BMP characters, we need to use
6666 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006667 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006668 need_dict = 1;
6669 for (i = 1; i < 256; i++) {
6670 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006671 ch = PyUnicode_READ(kind, data, i);
6672 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006673 need_dict = 1;
6674 break;
6675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006676 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006677 /* unmapped character */
6678 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006679 l1 = ch >> 11;
6680 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006681 if (level1[l1] == 0xFF)
6682 level1[l1] = count2++;
6683 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006684 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006685 }
6686
6687 if (count2 >= 0xFF || count3 >= 0xFF)
6688 need_dict = 1;
6689
6690 if (need_dict) {
6691 PyObject *result = PyDict_New();
6692 PyObject *key, *value;
6693 if (!result)
6694 return NULL;
6695 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006696 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006697 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006698 if (!key || !value)
6699 goto failed1;
6700 if (PyDict_SetItem(result, key, value) == -1)
6701 goto failed1;
6702 Py_DECREF(key);
6703 Py_DECREF(value);
6704 }
6705 return result;
6706 failed1:
6707 Py_XDECREF(key);
6708 Py_XDECREF(value);
6709 Py_DECREF(result);
6710 return NULL;
6711 }
6712
6713 /* Create a three-level trie */
6714 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6715 16*count2 + 128*count3 - 1);
6716 if (!result)
6717 return PyErr_NoMemory();
6718 PyObject_Init(result, &EncodingMapType);
6719 mresult = (struct encoding_map*)result;
6720 mresult->count2 = count2;
6721 mresult->count3 = count3;
6722 mlevel1 = mresult->level1;
6723 mlevel2 = mresult->level23;
6724 mlevel3 = mresult->level23 + 16*count2;
6725 memcpy(mlevel1, level1, 32);
6726 memset(mlevel2, 0xFF, 16*count2);
6727 memset(mlevel3, 0, 128*count3);
6728 count3 = 0;
6729 for (i = 1; i < 256; i++) {
6730 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006731 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006732 /* unmapped character */
6733 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006734 o1 = PyUnicode_READ(kind, data, i)>>11;
6735 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006736 i2 = 16*mlevel1[o1] + o2;
6737 if (mlevel2[i2] == 0xFF)
6738 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006739 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006740 i3 = 128*mlevel2[i2] + o3;
6741 mlevel3[i3] = i;
6742 }
6743 return result;
6744}
6745
6746static int
6747encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6748{
6749 struct encoding_map *map = (struct encoding_map*)mapping;
6750 int l1 = c>>11;
6751 int l2 = (c>>7) & 0xF;
6752 int l3 = c & 0x7F;
6753 int i;
6754
6755#ifdef Py_UNICODE_WIDE
6756 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006758 }
6759#endif
6760 if (c == 0)
6761 return 0;
6762 /* level 1*/
6763 i = map->level1[l1];
6764 if (i == 0xFF) {
6765 return -1;
6766 }
6767 /* level 2*/
6768 i = map->level23[16*i+l2];
6769 if (i == 0xFF) {
6770 return -1;
6771 }
6772 /* level 3 */
6773 i = map->level23[16*map->count2 + 128*i + l3];
6774 if (i == 0) {
6775 return -1;
6776 }
6777 return i;
6778}
6779
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006780/* Lookup the character ch in the mapping. If the character
6781 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006782 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006783static PyObject *
6784charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785{
Christian Heimes217cfd12007-12-02 14:31:20 +00006786 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006787 PyObject *x;
6788
6789 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006791 x = PyObject_GetItem(mapping, w);
6792 Py_DECREF(w);
6793 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006794 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6795 /* No mapping found means: mapping is undefined. */
6796 PyErr_Clear();
6797 x = Py_None;
6798 Py_INCREF(x);
6799 return x;
6800 } else
6801 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006803 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006805 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 long value = PyLong_AS_LONG(x);
6807 if (value < 0 || value > 255) {
6808 PyErr_SetString(PyExc_TypeError,
6809 "character mapping must be in range(256)");
6810 Py_DECREF(x);
6811 return NULL;
6812 }
6813 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006815 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 /* wrong return value */
6819 PyErr_Format(PyExc_TypeError,
6820 "character mapping must return integer, bytes or None, not %.400s",
6821 x->ob_type->tp_name);
6822 Py_DECREF(x);
6823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 }
6825}
6826
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006827static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006828charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006829{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006830 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6831 /* exponentially overallocate to minimize reallocations */
6832 if (requiredsize < 2*outsize)
6833 requiredsize = 2*outsize;
6834 if (_PyBytes_Resize(outobj, requiredsize))
6835 return -1;
6836 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006837}
6838
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006842/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006843 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844 space is available. Return a new reference to the object that
6845 was put in the output buffer, or Py_None, if the mapping was undefined
6846 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006847 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006848static charmapencode_result
6849charmapencode_output(Py_UNICODE c, PyObject *mapping,
6850 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006851{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006852 PyObject *rep;
6853 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006854 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006855
Christian Heimes90aa7642007-12-19 02:45:37 +00006856 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006857 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006859 if (res == -1)
6860 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 if (outsize<requiredsize)
6862 if (charmapencode_resize(outobj, outpos, requiredsize))
6863 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006864 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 outstart[(*outpos)++] = (char)res;
6866 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006867 }
6868
6869 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006870 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006872 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 Py_DECREF(rep);
6874 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006875 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 if (PyLong_Check(rep)) {
6877 Py_ssize_t requiredsize = *outpos+1;
6878 if (outsize<requiredsize)
6879 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6880 Py_DECREF(rep);
6881 return enc_EXCEPTION;
6882 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006883 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006885 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 else {
6887 const char *repchars = PyBytes_AS_STRING(rep);
6888 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6889 Py_ssize_t requiredsize = *outpos+repsize;
6890 if (outsize<requiredsize)
6891 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6892 Py_DECREF(rep);
6893 return enc_EXCEPTION;
6894 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006895 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 memcpy(outstart + *outpos, repchars, repsize);
6897 *outpos += repsize;
6898 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006900 Py_DECREF(rep);
6901 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006902}
6903
6904/* handle an error in PyUnicode_EncodeCharmap
6905 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006906static int
6907charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006908 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006909 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006910 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006911 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006912{
6913 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006914 Py_ssize_t repsize;
6915 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006916 Py_UNICODE *uni2;
6917 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006918 Py_ssize_t collstartpos = *inpos;
6919 Py_ssize_t collendpos = *inpos+1;
6920 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006921 char *encoding = "charmap";
6922 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006923 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006924
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006925 /* find all unencodable characters */
6926 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006927 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006928 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 int res = encoding_map_lookup(p[collendpos], mapping);
6930 if (res != -1)
6931 break;
6932 ++collendpos;
6933 continue;
6934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006935
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 rep = charmapencode_lookup(p[collendpos], mapping);
6937 if (rep==NULL)
6938 return -1;
6939 else if (rep!=Py_None) {
6940 Py_DECREF(rep);
6941 break;
6942 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006943 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006945 }
6946 /* cache callback name lookup
6947 * (if not done yet, i.e. it's the first error) */
6948 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 if ((errors==NULL) || (!strcmp(errors, "strict")))
6950 *known_errorHandler = 1;
6951 else if (!strcmp(errors, "replace"))
6952 *known_errorHandler = 2;
6953 else if (!strcmp(errors, "ignore"))
6954 *known_errorHandler = 3;
6955 else if (!strcmp(errors, "xmlcharrefreplace"))
6956 *known_errorHandler = 4;
6957 else
6958 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006959 }
6960 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006961 case 1: /* strict */
6962 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6963 return -1;
6964 case 2: /* replace */
6965 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 x = charmapencode_output('?', mapping, res, respos);
6967 if (x==enc_EXCEPTION) {
6968 return -1;
6969 }
6970 else if (x==enc_FAILED) {
6971 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6972 return -1;
6973 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006974 }
6975 /* fall through */
6976 case 3: /* ignore */
6977 *inpos = collendpos;
6978 break;
6979 case 4: /* xmlcharrefreplace */
6980 /* generate replacement (temporarily (mis)uses p) */
6981 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 char buffer[2+29+1+1];
6983 char *cp;
6984 sprintf(buffer, "&#%d;", (int)p[collpos]);
6985 for (cp = buffer; *cp; ++cp) {
6986 x = charmapencode_output(*cp, mapping, res, respos);
6987 if (x==enc_EXCEPTION)
6988 return -1;
6989 else if (x==enc_FAILED) {
6990 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6991 return -1;
6992 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006993 }
6994 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006995 *inpos = collendpos;
6996 break;
6997 default:
6998 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 encoding, reason, p, size, exceptionObject,
7000 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007001 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007002 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007003 if (PyBytes_Check(repunicode)) {
7004 /* Directly copy bytes result to output. */
7005 Py_ssize_t outsize = PyBytes_Size(*res);
7006 Py_ssize_t requiredsize;
7007 repsize = PyBytes_Size(repunicode);
7008 requiredsize = *respos + repsize;
7009 if (requiredsize > outsize)
7010 /* Make room for all additional bytes. */
7011 if (charmapencode_resize(res, respos, requiredsize)) {
7012 Py_DECREF(repunicode);
7013 return -1;
7014 }
7015 memcpy(PyBytes_AsString(*res) + *respos,
7016 PyBytes_AsString(repunicode), repsize);
7017 *respos += repsize;
7018 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007019 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007020 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007021 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007022 /* generate replacement */
7023 repsize = PyUnicode_GET_SIZE(repunicode);
7024 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 x = charmapencode_output(*uni2, mapping, res, respos);
7026 if (x==enc_EXCEPTION) {
7027 return -1;
7028 }
7029 else if (x==enc_FAILED) {
7030 Py_DECREF(repunicode);
7031 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7032 return -1;
7033 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007034 }
7035 *inpos = newpos;
7036 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007037 }
7038 return 0;
7039}
7040
Alexander Belopolsky40018472011-02-26 01:02:56 +00007041PyObject *
7042PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7043 Py_ssize_t size,
7044 PyObject *mapping,
7045 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007047 /* output object */
7048 PyObject *res = NULL;
7049 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007050 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007051 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007052 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007053 PyObject *errorHandler = NULL;
7054 PyObject *exc = NULL;
7055 /* the following variable is used for caching string comparisons
7056 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7057 * 3=ignore, 4=xmlcharrefreplace */
7058 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059
7060 /* Default to Latin-1 */
7061 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007064 /* allocate enough for a simple encoding without
7065 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007066 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007067 if (res == NULL)
7068 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007069 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007070 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007072 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007073 /* try to encode it */
7074 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7075 if (x==enc_EXCEPTION) /* error */
7076 goto onError;
7077 if (x==enc_FAILED) { /* unencodable character */
7078 if (charmap_encoding_error(p, size, &inpos, mapping,
7079 &exc,
7080 &known_errorHandler, &errorHandler, errors,
7081 &res, &respos)) {
7082 goto onError;
7083 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007084 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 else
7086 /* done with this character => adjust input position */
7087 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007090 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007091 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007092 if (_PyBytes_Resize(&res, respos) < 0)
7093 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007094
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007095 Py_XDECREF(exc);
7096 Py_XDECREF(errorHandler);
7097 return res;
7098
Benjamin Peterson29060642009-01-31 22:14:21 +00007099 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007100 Py_XDECREF(res);
7101 Py_XDECREF(exc);
7102 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103 return NULL;
7104}
7105
Alexander Belopolsky40018472011-02-26 01:02:56 +00007106PyObject *
7107PyUnicode_AsCharmapString(PyObject *unicode,
7108 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109{
7110 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007111 PyErr_BadArgument();
7112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113 }
7114 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 PyUnicode_GET_SIZE(unicode),
7116 mapping,
7117 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118}
7119
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007120/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007121static void
7122make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007123 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007124 Py_ssize_t startpos, Py_ssize_t endpos,
7125 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007127 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007128 *exceptionObject = _PyUnicodeTranslateError_Create(
7129 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 }
7131 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7133 goto onError;
7134 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7135 goto onError;
7136 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7137 goto onError;
7138 return;
7139 onError:
7140 Py_DECREF(*exceptionObject);
7141 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 }
7143}
7144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007145/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007146static void
7147raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007148 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007149 Py_ssize_t startpos, Py_ssize_t endpos,
7150 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007151{
7152 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007153 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007154 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007155 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007156}
7157
7158/* error handling callback helper:
7159 build arguments, call the callback and check the arguments,
7160 put the result into newpos and return the replacement string, which
7161 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007162static PyObject *
7163unicode_translate_call_errorhandler(const char *errors,
7164 PyObject **errorHandler,
7165 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007166 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007167 Py_ssize_t startpos, Py_ssize_t endpos,
7168 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007169{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007170 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007171
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007172 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007173 PyObject *restuple;
7174 PyObject *resunicode;
7175
7176 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007178 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007179 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007180 }
7181
7182 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007183 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007184 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007185 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007186
7187 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007188 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007189 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007191 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007192 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007193 Py_DECREF(restuple);
7194 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007195 }
7196 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 &resunicode, &i_newpos)) {
7198 Py_DECREF(restuple);
7199 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007200 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007201 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007202 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007203 else
7204 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007205 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7207 Py_DECREF(restuple);
7208 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007209 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007210 Py_INCREF(resunicode);
7211 Py_DECREF(restuple);
7212 return resunicode;
7213}
7214
7215/* Lookup the character ch in the mapping and put the result in result,
7216 which must be decrefed by the caller.
7217 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007218static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007219charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007220{
Christian Heimes217cfd12007-12-02 14:31:20 +00007221 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007222 PyObject *x;
7223
7224 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007226 x = PyObject_GetItem(mapping, w);
7227 Py_DECREF(w);
7228 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7230 /* No mapping found means: use 1:1 mapping. */
7231 PyErr_Clear();
7232 *result = NULL;
7233 return 0;
7234 } else
7235 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007236 }
7237 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 *result = x;
7239 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007240 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007241 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 long value = PyLong_AS_LONG(x);
7243 long max = PyUnicode_GetMax();
7244 if (value < 0 || value > max) {
7245 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007246 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 Py_DECREF(x);
7248 return -1;
7249 }
7250 *result = x;
7251 return 0;
7252 }
7253 else if (PyUnicode_Check(x)) {
7254 *result = x;
7255 return 0;
7256 }
7257 else {
7258 /* wrong return value */
7259 PyErr_SetString(PyExc_TypeError,
7260 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007261 Py_DECREF(x);
7262 return -1;
7263 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007264}
7265/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 if not reallocate and adjust various state variables.
7267 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007268static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007269charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007271{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007272 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007273 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007274 /* exponentially overallocate to minimize reallocations */
7275 if (requiredsize < 2 * oldsize)
7276 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007277 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7278 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007280 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007281 }
7282 return 0;
7283}
7284/* lookup the character, put the result in the output string and adjust
7285 various state variables. Return a new reference to the object that
7286 was put in the output buffer in *result, or Py_None, if the mapping was
7287 undefined (in which case no character was written).
7288 The called must decref result.
7289 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007290static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007291charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7292 PyObject *mapping, Py_UCS4 **output,
7293 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007294 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007295{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007296 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7297 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007299 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007301 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007302 }
7303 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007305 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007307 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007308 }
7309 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007310 Py_ssize_t repsize;
7311 if (PyUnicode_READY(*res) == -1)
7312 return -1;
7313 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 if (repsize==1) {
7315 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007316 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 }
7318 else if (repsize!=0) {
7319 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007320 Py_ssize_t requiredsize = *opos +
7321 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007323 Py_ssize_t i;
7324 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007326 for(i = 0; i < repsize; i++)
7327 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007329 }
7330 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007332 return 0;
7333}
7334
Alexander Belopolsky40018472011-02-26 01:02:56 +00007335PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007336_PyUnicode_TranslateCharmap(PyObject *input,
7337 PyObject *mapping,
7338 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007340 /* input object */
7341 char *idata;
7342 Py_ssize_t size, i;
7343 int kind;
7344 /* output buffer */
7345 Py_UCS4 *output = NULL;
7346 Py_ssize_t osize;
7347 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007348 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007349 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007350 char *reason = "character maps to <undefined>";
7351 PyObject *errorHandler = NULL;
7352 PyObject *exc = NULL;
7353 /* the following variable is used for caching string comparisons
7354 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7355 * 3=ignore, 4=xmlcharrefreplace */
7356 int known_errorHandler = -1;
7357
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 PyErr_BadArgument();
7360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007363 if (PyUnicode_READY(input) == -1)
7364 return NULL;
7365 idata = (char*)PyUnicode_DATA(input);
7366 kind = PyUnicode_KIND(input);
7367 size = PyUnicode_GET_LENGTH(input);
7368 i = 0;
7369
7370 if (size == 0) {
7371 Py_INCREF(input);
7372 return input;
7373 }
7374
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007375 /* allocate enough for a simple 1:1 translation without
7376 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007377 osize = size;
7378 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7379 opos = 0;
7380 if (output == NULL) {
7381 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007383 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007385 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 /* try to encode it */
7387 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007388 if (charmaptranslate_output(input, i, mapping,
7389 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 Py_XDECREF(x);
7391 goto onError;
7392 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007393 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007395 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 else { /* untranslatable character */
7397 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7398 Py_ssize_t repsize;
7399 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007400 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007402 Py_ssize_t collstart = i;
7403 Py_ssize_t collend = i+1;
7404 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007407 while (collend < size) {
7408 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 goto onError;
7410 Py_XDECREF(x);
7411 if (x!=Py_None)
7412 break;
7413 ++collend;
7414 }
7415 /* cache callback name lookup
7416 * (if not done yet, i.e. it's the first error) */
7417 if (known_errorHandler==-1) {
7418 if ((errors==NULL) || (!strcmp(errors, "strict")))
7419 known_errorHandler = 1;
7420 else if (!strcmp(errors, "replace"))
7421 known_errorHandler = 2;
7422 else if (!strcmp(errors, "ignore"))
7423 known_errorHandler = 3;
7424 else if (!strcmp(errors, "xmlcharrefreplace"))
7425 known_errorHandler = 4;
7426 else
7427 known_errorHandler = 0;
7428 }
7429 switch (known_errorHandler) {
7430 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007431 raise_translate_exception(&exc, input, collstart,
7432 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007433 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007434 case 2: /* replace */
7435 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007436 for (coll = collstart; coll<collend; coll++)
7437 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 /* fall through */
7439 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007440 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 break;
7442 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007443 /* generate replacement (temporarily (mis)uses i) */
7444 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 char buffer[2+29+1+1];
7446 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007447 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7448 if (charmaptranslate_makespace(&output, &osize,
7449 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 goto onError;
7451 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007452 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007454 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 break;
7456 default:
7457 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007458 reason, input, &exc,
7459 collstart, collend, &newpos);
7460 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 goto onError;
7462 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007463 repsize = PyUnicode_GET_LENGTH(repunicode);
7464 if (charmaptranslate_makespace(&output, &osize,
7465 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 Py_DECREF(repunicode);
7467 goto onError;
7468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007469 for (uni2 = 0; repsize-->0; ++uni2)
7470 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7471 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007473 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007474 }
7475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007476 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7477 if (!res)
7478 goto onError;
7479 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007480 Py_XDECREF(exc);
7481 Py_XDECREF(errorHandler);
7482 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007485 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007486 Py_XDECREF(exc);
7487 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 return NULL;
7489}
7490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007491/* Deprecated. Use PyUnicode_Translate instead. */
7492PyObject *
7493PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7494 Py_ssize_t size,
7495 PyObject *mapping,
7496 const char *errors)
7497{
7498 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7499 if (!unicode)
7500 return NULL;
7501 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7502}
7503
Alexander Belopolsky40018472011-02-26 01:02:56 +00007504PyObject *
7505PyUnicode_Translate(PyObject *str,
7506 PyObject *mapping,
7507 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508{
7509 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007510
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 str = PyUnicode_FromObject(str);
7512 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007513 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007514 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 Py_DECREF(str);
7516 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007517
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519 Py_XDECREF(str);
7520 return NULL;
7521}
Tim Petersced69f82003-09-16 20:30:58 +00007522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007523static Py_UCS4
7524fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7525{
7526 /* No need to call PyUnicode_READY(self) because this function is only
7527 called as a callback from fixup() which does it already. */
7528 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7529 const int kind = PyUnicode_KIND(self);
7530 void *data = PyUnicode_DATA(self);
7531 Py_UCS4 maxchar = 0, ch, fixed;
7532 Py_ssize_t i;
7533
7534 for (i = 0; i < len; ++i) {
7535 ch = PyUnicode_READ(kind, data, i);
7536 fixed = 0;
7537 if (ch > 127) {
7538 if (Py_UNICODE_ISSPACE(ch))
7539 fixed = ' ';
7540 else {
7541 const int decimal = Py_UNICODE_TODECIMAL(ch);
7542 if (decimal >= 0)
7543 fixed = '0' + decimal;
7544 }
7545 if (fixed != 0) {
7546 if (fixed > maxchar)
7547 maxchar = fixed;
7548 PyUnicode_WRITE(kind, data, i, fixed);
7549 }
7550 else if (ch > maxchar)
7551 maxchar = ch;
7552 }
7553 else if (ch > maxchar)
7554 maxchar = ch;
7555 }
7556
7557 return maxchar;
7558}
7559
7560PyObject *
7561_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7562{
7563 if (!PyUnicode_Check(unicode)) {
7564 PyErr_BadInternalCall();
7565 return NULL;
7566 }
7567 if (PyUnicode_READY(unicode) == -1)
7568 return NULL;
7569 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7570 /* If the string is already ASCII, just return the same string */
7571 Py_INCREF(unicode);
7572 return unicode;
7573 }
7574 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7575}
7576
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007577PyObject *
7578PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7579 Py_ssize_t length)
7580{
7581 PyObject *result;
7582 Py_UNICODE *p; /* write pointer into result */
7583 Py_ssize_t i;
7584 /* Copy to a new string */
7585 result = (PyObject *)_PyUnicode_New(length);
7586 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7587 if (result == NULL)
7588 return result;
7589 p = PyUnicode_AS_UNICODE(result);
7590 /* Iterate over code points */
7591 for (i = 0; i < length; i++) {
7592 Py_UNICODE ch =s[i];
7593 if (ch > 127) {
7594 int decimal = Py_UNICODE_TODECIMAL(ch);
7595 if (decimal >= 0)
7596 p[i] = '0' + decimal;
7597 }
7598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007599 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7600 Py_DECREF(result);
7601 return NULL;
7602 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007603 return result;
7604}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007605/* --- Decimal Encoder ---------------------------------------------------- */
7606
Alexander Belopolsky40018472011-02-26 01:02:56 +00007607int
7608PyUnicode_EncodeDecimal(Py_UNICODE *s,
7609 Py_ssize_t length,
7610 char *output,
7611 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007612{
7613 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007614 PyObject *errorHandler = NULL;
7615 PyObject *exc = NULL;
7616 const char *encoding = "decimal";
7617 const char *reason = "invalid decimal Unicode string";
7618 /* the following variable is used for caching string comparisons
7619 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7620 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007621
7622 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 PyErr_BadArgument();
7624 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007625 }
7626
7627 p = s;
7628 end = s + length;
7629 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 register Py_UNICODE ch = *p;
7631 int decimal;
7632 PyObject *repunicode;
7633 Py_ssize_t repsize;
7634 Py_ssize_t newpos;
7635 Py_UNICODE *uni2;
7636 Py_UNICODE *collstart;
7637 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007638
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007640 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 ++p;
7642 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007643 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 decimal = Py_UNICODE_TODECIMAL(ch);
7645 if (decimal >= 0) {
7646 *output++ = '0' + decimal;
7647 ++p;
7648 continue;
7649 }
7650 if (0 < ch && ch < 256) {
7651 *output++ = (char)ch;
7652 ++p;
7653 continue;
7654 }
7655 /* All other characters are considered unencodable */
7656 collstart = p;
7657 collend = p+1;
7658 while (collend < end) {
7659 if ((0 < *collend && *collend < 256) ||
7660 !Py_UNICODE_ISSPACE(*collend) ||
7661 Py_UNICODE_TODECIMAL(*collend))
7662 break;
7663 }
7664 /* cache callback name lookup
7665 * (if not done yet, i.e. it's the first error) */
7666 if (known_errorHandler==-1) {
7667 if ((errors==NULL) || (!strcmp(errors, "strict")))
7668 known_errorHandler = 1;
7669 else if (!strcmp(errors, "replace"))
7670 known_errorHandler = 2;
7671 else if (!strcmp(errors, "ignore"))
7672 known_errorHandler = 3;
7673 else if (!strcmp(errors, "xmlcharrefreplace"))
7674 known_errorHandler = 4;
7675 else
7676 known_errorHandler = 0;
7677 }
7678 switch (known_errorHandler) {
7679 case 1: /* strict */
7680 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7681 goto onError;
7682 case 2: /* replace */
7683 for (p = collstart; p < collend; ++p)
7684 *output++ = '?';
7685 /* fall through */
7686 case 3: /* ignore */
7687 p = collend;
7688 break;
7689 case 4: /* xmlcharrefreplace */
7690 /* generate replacement (temporarily (mis)uses p) */
7691 for (p = collstart; p < collend; ++p)
7692 output += sprintf(output, "&#%d;", (int)*p);
7693 p = collend;
7694 break;
7695 default:
7696 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7697 encoding, reason, s, length, &exc,
7698 collstart-s, collend-s, &newpos);
7699 if (repunicode == NULL)
7700 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007701 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007702 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007703 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7704 Py_DECREF(repunicode);
7705 goto onError;
7706 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 /* generate replacement */
7708 repsize = PyUnicode_GET_SIZE(repunicode);
7709 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7710 Py_UNICODE ch = *uni2;
7711 if (Py_UNICODE_ISSPACE(ch))
7712 *output++ = ' ';
7713 else {
7714 decimal = Py_UNICODE_TODECIMAL(ch);
7715 if (decimal >= 0)
7716 *output++ = '0' + decimal;
7717 else if (0 < ch && ch < 256)
7718 *output++ = (char)ch;
7719 else {
7720 Py_DECREF(repunicode);
7721 raise_encode_exception(&exc, encoding,
7722 s, length, collstart-s, collend-s, reason);
7723 goto onError;
7724 }
7725 }
7726 }
7727 p = s + newpos;
7728 Py_DECREF(repunicode);
7729 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007730 }
7731 /* 0-terminate the output string */
7732 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007733 Py_XDECREF(exc);
7734 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007735 return 0;
7736
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007738 Py_XDECREF(exc);
7739 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007740 return -1;
7741}
7742
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743/* --- Helpers ------------------------------------------------------------ */
7744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007745#include "stringlib/ucs1lib.h"
7746#include "stringlib/fastsearch.h"
7747#include "stringlib/partition.h"
7748#include "stringlib/split.h"
7749#include "stringlib/count.h"
7750#include "stringlib/find.h"
7751#include "stringlib/localeutil.h"
7752#include "stringlib/undef.h"
7753
7754#include "stringlib/ucs2lib.h"
7755#include "stringlib/fastsearch.h"
7756#include "stringlib/partition.h"
7757#include "stringlib/split.h"
7758#include "stringlib/count.h"
7759#include "stringlib/find.h"
7760#include "stringlib/localeutil.h"
7761#include "stringlib/undef.h"
7762
7763#include "stringlib/ucs4lib.h"
7764#include "stringlib/fastsearch.h"
7765#include "stringlib/partition.h"
7766#include "stringlib/split.h"
7767#include "stringlib/count.h"
7768#include "stringlib/find.h"
7769#include "stringlib/localeutil.h"
7770#include "stringlib/undef.h"
7771
7772static Py_ssize_t
7773any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7774 const Py_UCS1*, Py_ssize_t,
7775 Py_ssize_t, Py_ssize_t),
7776 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7777 const Py_UCS2*, Py_ssize_t,
7778 Py_ssize_t, Py_ssize_t),
7779 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7780 const Py_UCS4*, Py_ssize_t,
7781 Py_ssize_t, Py_ssize_t),
7782 PyObject* s1, PyObject* s2,
7783 Py_ssize_t start,
7784 Py_ssize_t end)
7785{
7786 int kind1, kind2, kind;
7787 void *buf1, *buf2;
7788 Py_ssize_t len1, len2, result;
7789
7790 kind1 = PyUnicode_KIND(s1);
7791 kind2 = PyUnicode_KIND(s2);
7792 kind = kind1 > kind2 ? kind1 : kind2;
7793 buf1 = PyUnicode_DATA(s1);
7794 buf2 = PyUnicode_DATA(s2);
7795 if (kind1 != kind)
7796 buf1 = _PyUnicode_AsKind(s1, kind);
7797 if (!buf1)
7798 return -2;
7799 if (kind2 != kind)
7800 buf2 = _PyUnicode_AsKind(s2, kind);
7801 if (!buf2) {
7802 if (kind1 != kind) PyMem_Free(buf1);
7803 return -2;
7804 }
7805 len1 = PyUnicode_GET_LENGTH(s1);
7806 len2 = PyUnicode_GET_LENGTH(s2);
7807
7808 switch(kind) {
7809 case PyUnicode_1BYTE_KIND:
7810 result = ucs1(buf1, len1, buf2, len2, start, end);
7811 break;
7812 case PyUnicode_2BYTE_KIND:
7813 result = ucs2(buf1, len1, buf2, len2, start, end);
7814 break;
7815 case PyUnicode_4BYTE_KIND:
7816 result = ucs4(buf1, len1, buf2, len2, start, end);
7817 break;
7818 default:
7819 assert(0); result = -2;
7820 }
7821
7822 if (kind1 != kind)
7823 PyMem_Free(buf1);
7824 if (kind2 != kind)
7825 PyMem_Free(buf2);
7826
7827 return result;
7828}
7829
7830Py_ssize_t
7831_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7832 Py_ssize_t n_buffer,
7833 void *digits, Py_ssize_t n_digits,
7834 Py_ssize_t min_width,
7835 const char *grouping,
7836 const char *thousands_sep)
7837{
7838 switch(kind) {
7839 case PyUnicode_1BYTE_KIND:
7840 return _PyUnicode_ucs1_InsertThousandsGrouping(
7841 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7842 min_width, grouping, thousands_sep);
7843 case PyUnicode_2BYTE_KIND:
7844 return _PyUnicode_ucs2_InsertThousandsGrouping(
7845 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7846 min_width, grouping, thousands_sep);
7847 case PyUnicode_4BYTE_KIND:
7848 return _PyUnicode_ucs4_InsertThousandsGrouping(
7849 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7850 min_width, grouping, thousands_sep);
7851 }
7852 assert(0);
7853 return -1;
7854}
7855
7856
Eric Smith8c663262007-08-25 02:26:07 +00007857#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007858#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007859
Thomas Wouters477c8d52006-05-27 19:21:47 +00007860#include "stringlib/count.h"
7861#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007862
/* helper macro to fixup start/end slice values

   Clamps end to len, and maps negative start/end to an offset from len
   (clamped at 0).  start and end must be modifiable lvalues; every argument
   may be evaluated more than once.

   The body is wrapped in do { ... } while (0) so that
   "ADJUST_INDICES(...);" is exactly one statement and is safe inside an
   unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007877
Alexander Belopolsky40018472011-02-26 01:02:56 +00007878Py_ssize_t
7879PyUnicode_Count(PyObject *str,
7880 PyObject *substr,
7881 Py_ssize_t start,
7882 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007884 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007885 PyUnicodeObject* str_obj;
7886 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007887 int kind1, kind2, kind;
7888 void *buf1 = NULL, *buf2 = NULL;
7889 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007890
Thomas Wouters477c8d52006-05-27 19:21:47 +00007891 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007892 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007894 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007895 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 Py_DECREF(str_obj);
7897 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898 }
Tim Petersced69f82003-09-16 20:30:58 +00007899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007900 kind1 = PyUnicode_KIND(str_obj);
7901 kind2 = PyUnicode_KIND(sub_obj);
7902 kind = kind1 > kind2 ? kind1 : kind2;
7903 buf1 = PyUnicode_DATA(str_obj);
7904 if (kind1 != kind)
7905 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7906 if (!buf1)
7907 goto onError;
7908 buf2 = PyUnicode_DATA(sub_obj);
7909 if (kind2 != kind)
7910 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7911 if (!buf2)
7912 goto onError;
7913 len1 = PyUnicode_GET_LENGTH(str_obj);
7914 len2 = PyUnicode_GET_LENGTH(sub_obj);
7915
7916 ADJUST_INDICES(start, end, len1);
7917 switch(kind) {
7918 case PyUnicode_1BYTE_KIND:
7919 result = ucs1lib_count(
7920 ((Py_UCS1*)buf1) + start, end - start,
7921 buf2, len2, PY_SSIZE_T_MAX
7922 );
7923 break;
7924 case PyUnicode_2BYTE_KIND:
7925 result = ucs2lib_count(
7926 ((Py_UCS2*)buf1) + start, end - start,
7927 buf2, len2, PY_SSIZE_T_MAX
7928 );
7929 break;
7930 case PyUnicode_4BYTE_KIND:
7931 result = ucs4lib_count(
7932 ((Py_UCS4*)buf1) + start, end - start,
7933 buf2, len2, PY_SSIZE_T_MAX
7934 );
7935 break;
7936 default:
7937 assert(0); result = 0;
7938 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007939
7940 Py_DECREF(sub_obj);
7941 Py_DECREF(str_obj);
7942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007943 if (kind1 != kind)
7944 PyMem_Free(buf1);
7945 if (kind2 != kind)
7946 PyMem_Free(buf2);
7947
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007949 onError:
7950 Py_DECREF(sub_obj);
7951 Py_DECREF(str_obj);
7952 if (kind1 != kind && buf1)
7953 PyMem_Free(buf1);
7954 if (kind2 != kind && buf2)
7955 PyMem_Free(buf2);
7956 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957}
7958
Alexander Belopolsky40018472011-02-26 01:02:56 +00007959Py_ssize_t
7960PyUnicode_Find(PyObject *str,
7961 PyObject *sub,
7962 Py_ssize_t start,
7963 Py_ssize_t end,
7964 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007966 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00007967
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007969 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007971 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007972 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 Py_DECREF(str);
7974 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 }
Tim Petersced69f82003-09-16 20:30:58 +00007976
Thomas Wouters477c8d52006-05-27 19:21:47 +00007977 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007978 result = any_find_slice(
7979 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
7980 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007981 );
7982 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007983 result = any_find_slice(
7984 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
7985 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007986 );
7987
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007989 Py_DECREF(sub);
7990
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 return result;
7992}
7993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007994Py_ssize_t
7995PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
7996 Py_ssize_t start, Py_ssize_t end,
7997 int direction)
7998{
7999 char *result;
8000 int kind;
8001 if (PyUnicode_READY(str) == -1)
8002 return -2;
8003 if (end > PyUnicode_GET_LENGTH(str))
8004 end = PyUnicode_GET_LENGTH(str);
8005 kind = PyUnicode_KIND(str);
8006 result = findchar(PyUnicode_1BYTE_DATA(str)
8007 + PyUnicode_KIND_SIZE(kind, start),
8008 kind,
8009 end-start, ch, direction);
8010 if (!result)
8011 return -1;
8012 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8013}
8014
Alexander Belopolsky40018472011-02-26 01:02:56 +00008015static int
8016tailmatch(PyUnicodeObject *self,
8017 PyUnicodeObject *substring,
8018 Py_ssize_t start,
8019 Py_ssize_t end,
8020 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008022 int kind_self;
8023 int kind_sub;
8024 void *data_self;
8025 void *data_sub;
8026 Py_ssize_t offset;
8027 Py_ssize_t i;
8028 Py_ssize_t end_sub;
8029
8030 if (PyUnicode_READY(self) == -1 ||
8031 PyUnicode_READY(substring) == -1)
8032 return 0;
8033
8034 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 return 1;
8036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008037 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8038 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008042 kind_self = PyUnicode_KIND(self);
8043 data_self = PyUnicode_DATA(self);
8044 kind_sub = PyUnicode_KIND(substring);
8045 data_sub = PyUnicode_DATA(substring);
8046 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8047
8048 if (direction > 0)
8049 offset = end;
8050 else
8051 offset = start;
8052
8053 if (PyUnicode_READ(kind_self, data_self, offset) ==
8054 PyUnicode_READ(kind_sub, data_sub, 0) &&
8055 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8056 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8057 /* If both are of the same kind, memcmp is sufficient */
8058 if (kind_self == kind_sub) {
8059 return ! memcmp((char *)data_self +
8060 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8061 data_sub,
8062 PyUnicode_GET_LENGTH(substring) *
8063 PyUnicode_CHARACTER_SIZE(substring));
8064 }
8065 /* otherwise we have to compare each character by first accesing it */
8066 else {
8067 /* We do not need to compare 0 and len(substring)-1 because
8068 the if statement above ensured already that they are equal
8069 when we end up here. */
8070 // TODO: honor direction and do a forward or backwards search
8071 for (i = 1; i < end_sub; ++i) {
8072 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8073 PyUnicode_READ(kind_sub, data_sub, i))
8074 return 0;
8075 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078 }
8079
8080 return 0;
8081}
8082
Alexander Belopolsky40018472011-02-26 01:02:56 +00008083Py_ssize_t
8084PyUnicode_Tailmatch(PyObject *str,
8085 PyObject *substr,
8086 Py_ssize_t start,
8087 Py_ssize_t end,
8088 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008090 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008091
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092 str = PyUnicode_FromObject(str);
8093 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095 substr = PyUnicode_FromObject(substr);
8096 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 Py_DECREF(str);
8098 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099 }
Tim Petersced69f82003-09-16 20:30:58 +00008100
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 (PyUnicodeObject *)substr,
8103 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104 Py_DECREF(str);
8105 Py_DECREF(substr);
8106 return result;
8107}
8108
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109/* Apply fixfct filter to the Unicode object self and return a
8110 reference to the modified object */
8111
Alexander Belopolsky40018472011-02-26 01:02:56 +00008112static PyObject *
8113fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008114 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008116 PyObject *u;
8117 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 if (PyUnicode_READY(self) == -1)
8120 return NULL;
8121 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8122 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8123 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008127 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8128 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008130 /* fix functions return the new maximum character in a string,
8131 if the kind of the resulting unicode object does not change,
8132 everything is fine. Otherwise we need to change the string kind
8133 and re-run the fix function. */
8134 maxchar_new = fixfct((PyUnicodeObject*)u);
8135 if (maxchar_new == 0)
8136 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8137 else if (maxchar_new <= 127)
8138 maxchar_new = 127;
8139 else if (maxchar_new <= 255)
8140 maxchar_new = 255;
8141 else if (maxchar_new <= 65535)
8142 maxchar_new = 65535;
8143 else
8144 maxchar_new = 1114111; /* 0x10ffff */
8145
8146 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 /* fixfct should return TRUE if it modified the buffer. If
8148 FALSE, return a reference to the original buffer instead
8149 (to save space, not time) */
8150 Py_INCREF(self);
8151 Py_DECREF(u);
8152 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008154 else if (maxchar_new == maxchar_old) {
8155 return u;
8156 }
8157 else {
8158 /* In case the maximum character changed, we need to
8159 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008160 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161 if (v == NULL) {
8162 Py_DECREF(u);
8163 return NULL;
8164 }
8165 if (maxchar_new > maxchar_old) {
8166 /* If the maxchar increased so that the kind changed, not all
8167 characters are representable anymore and we need to fix the
8168 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008169 if (PyUnicode_CopyCharacters(v, 0,
8170 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008171 PyUnicode_GET_LENGTH(self)) < 0)
8172 {
8173 Py_DECREF(u);
8174 return NULL;
8175 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008176 maxchar_old = fixfct((PyUnicodeObject*)v);
8177 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8178 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008179 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008180 if (PyUnicode_CopyCharacters(v, 0,
8181 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008182 PyUnicode_GET_LENGTH(self)) < 0)
8183 {
8184 Py_DECREF(u);
8185 return NULL;
8186 }
8187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008188
8189 Py_DECREF(u);
8190 return v;
8191 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192}
8193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008194static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008195fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008197 /* No need to call PyUnicode_READY(self) because this function is only
8198 called as a callback from fixup() which does it already. */
8199 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8200 const int kind = PyUnicode_KIND(self);
8201 void *data = PyUnicode_DATA(self);
8202 int touched = 0;
8203 Py_UCS4 maxchar = 0;
8204 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008206 for (i = 0; i < len; ++i) {
8207 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8208 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8209 if (up != ch) {
8210 if (up > maxchar)
8211 maxchar = up;
8212 PyUnicode_WRITE(kind, data, i, up);
8213 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008215 else if (ch > maxchar)
8216 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217 }
8218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008219 if (touched)
8220 return maxchar;
8221 else
8222 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223}
8224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008225static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008226fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008228 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8229 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8230 const int kind = PyUnicode_KIND(self);
8231 void *data = PyUnicode_DATA(self);
8232 int touched = 0;
8233 Py_UCS4 maxchar = 0;
8234 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008236 for(i = 0; i < len; ++i) {
8237 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8238 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8239 if (lo != ch) {
8240 if (lo > maxchar)
8241 maxchar = lo;
8242 PyUnicode_WRITE(kind, data, i, lo);
8243 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008245 else if (ch > maxchar)
8246 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 }
8248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008249 if (touched)
8250 return maxchar;
8251 else
8252 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253}
8254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008255static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008256fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8259 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8260 const int kind = PyUnicode_KIND(self);
8261 void *data = PyUnicode_DATA(self);
8262 int touched = 0;
8263 Py_UCS4 maxchar = 0;
8264 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008266 for(i = 0; i < len; ++i) {
8267 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8268 Py_UCS4 nu = 0;
8269
8270 if (Py_UNICODE_ISUPPER(ch))
8271 nu = Py_UNICODE_TOLOWER(ch);
8272 else if (Py_UNICODE_ISLOWER(ch))
8273 nu = Py_UNICODE_TOUPPER(ch);
8274
8275 if (nu != 0) {
8276 if (nu > maxchar)
8277 maxchar = nu;
8278 PyUnicode_WRITE(kind, data, i, nu);
8279 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008281 else if (ch > maxchar)
8282 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 }
8284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008285 if (touched)
8286 return maxchar;
8287 else
8288 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289}
8290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008291static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008292fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008294 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8295 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8296 const int kind = PyUnicode_KIND(self);
8297 void *data = PyUnicode_DATA(self);
8298 int touched = 0;
8299 Py_UCS4 maxchar = 0;
8300 Py_ssize_t i = 0;
8301 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008302
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008303 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008305
8306 ch = PyUnicode_READ(kind, data, i);
8307 if (!Py_UNICODE_ISUPPER(ch)) {
8308 maxchar = Py_UNICODE_TOUPPER(ch);
8309 PyUnicode_WRITE(kind, data, i, maxchar);
8310 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312 ++i;
8313 for(; i < len; ++i) {
8314 ch = PyUnicode_READ(kind, data, i);
8315 if (!Py_UNICODE_ISLOWER(ch)) {
8316 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8317 if (lo > maxchar)
8318 maxchar = lo;
8319 PyUnicode_WRITE(kind, data, i, lo);
8320 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008322 else if (ch > maxchar)
8323 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008324 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008325
8326 if (touched)
8327 return maxchar;
8328 else
8329 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330}
8331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8336 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8337 const int kind = PyUnicode_KIND(self);
8338 void *data = PyUnicode_DATA(self);
8339 Py_UCS4 maxchar = 0;
8340 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 int previous_is_cased;
8342
8343 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344 if (len == 1) {
8345 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8346 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8347 if (ti != ch) {
8348 PyUnicode_WRITE(kind, data, i, ti);
8349 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 }
8351 else
8352 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 for(; i < len; ++i) {
8356 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8357 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008358
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362 nu = Py_UNICODE_TOTITLE(ch);
8363
8364 if (nu > maxchar)
8365 maxchar = nu;
8366 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008367
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 if (Py_UNICODE_ISLOWER(ch) ||
8369 Py_UNICODE_ISUPPER(ch) ||
8370 Py_UNICODE_ISTITLE(ch))
8371 previous_is_cased = 1;
8372 else
8373 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376}
8377
Tim Peters8ce9f162004-08-27 01:49:32 +00008378PyObject *
8379PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008382 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008384 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008385 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8386 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008387 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 Py_ssize_t sz, i, res_offset;
8389 Py_UCS4 maxchar = 0;
8390 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391
Tim Peters05eba1f2004-08-27 21:32:02 +00008392 fseq = PySequence_Fast(seq, "");
8393 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008394 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008395 }
8396
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008397 /* NOTE: the following code can't call back into Python code,
8398 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008399 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008400
Tim Peters05eba1f2004-08-27 21:32:02 +00008401 seqlen = PySequence_Fast_GET_SIZE(fseq);
8402 /* If empty sequence, return u"". */
8403 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008405 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008406 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008407 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008408 /* If singleton sequence with an exact Unicode, return that. */
8409 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 item = items[0];
8411 if (PyUnicode_CheckExact(item)) {
8412 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008413 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 goto Done;
8415 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008416 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008417 else {
8418 /* Set up sep and seplen */
8419 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008420 /* fall back to a blank space separator */
8421 sep = PyUnicode_FromOrdinal(' ');
8422 if (!sep || PyUnicode_READY(sep) == -1)
8423 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008424 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008425 else {
8426 if (!PyUnicode_Check(separator)) {
8427 PyErr_Format(PyExc_TypeError,
8428 "separator: expected str instance,"
8429 " %.80s found",
8430 Py_TYPE(separator)->tp_name);
8431 goto onError;
8432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 if (PyUnicode_READY(separator) == -1)
8434 goto onError;
8435 sep = separator;
8436 seplen = PyUnicode_GET_LENGTH(separator);
8437 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8438 /* inc refcount to keep this code path symetric with the
8439 above case of a blank separator */
8440 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008441 }
8442 }
8443
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008444 /* There are at least two things to join, or else we have a subclass
8445 * of str in the sequence.
8446 * Do a pre-pass to figure out the total amount of space we'll
8447 * need (sz), and see whether all argument are strings.
8448 */
8449 sz = 0;
8450 for (i = 0; i < seqlen; i++) {
8451 const Py_ssize_t old_sz = sz;
8452 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 if (!PyUnicode_Check(item)) {
8454 PyErr_Format(PyExc_TypeError,
8455 "sequence item %zd: expected str instance,"
8456 " %.80s found",
8457 i, Py_TYPE(item)->tp_name);
8458 goto onError;
8459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 if (PyUnicode_READY(item) == -1)
8461 goto onError;
8462 sz += PyUnicode_GET_LENGTH(item);
8463 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8464 if (item_maxchar > maxchar)
8465 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008466 if (i != 0)
8467 sz += seplen;
8468 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8469 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008471 goto onError;
8472 }
8473 }
Tim Petersced69f82003-09-16 20:30:58 +00008474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008475 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008476 if (res == NULL)
8477 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008478
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008479 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008481 Py_ssize_t itemlen;
8482 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008483 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 /* Copy item, and maybe the separator. */
8485 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008486 if (PyUnicode_CopyCharacters(res, res_offset,
8487 sep, 0, seplen) < 0)
8488 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008491 if (PyUnicode_CopyCharacters(res, res_offset,
8492 item, 0, itemlen) < 0)
8493 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008494 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008497
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008499 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008500 Py_XDECREF(sep);
8501 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008504 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008506 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 return NULL;
8508}
8509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510#define FILL(kind, data, value, start, length) \
8511 do { \
8512 Py_ssize_t i_ = 0; \
8513 assert(kind != PyUnicode_WCHAR_KIND); \
8514 switch ((kind)) { \
8515 case PyUnicode_1BYTE_KIND: { \
8516 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8517 memset(to_, (unsigned char)value, length); \
8518 break; \
8519 } \
8520 case PyUnicode_2BYTE_KIND: { \
8521 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8522 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8523 break; \
8524 } \
8525 default: { \
8526 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8527 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8528 break; \
8529 } \
8530 } \
8531 } while (0)
8532
Alexander Belopolsky40018472011-02-26 01:02:56 +00008533static PyUnicodeObject *
8534pad(PyUnicodeObject *self,
8535 Py_ssize_t left,
8536 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539 PyObject *u;
8540 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008541 int kind;
8542 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543
8544 if (left < 0)
8545 left = 0;
8546 if (right < 0)
8547 right = 0;
8548
Tim Peters7a29bd52001-09-12 03:03:31 +00008549 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550 Py_INCREF(self);
8551 return self;
8552 }
8553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8555 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008556 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8557 return NULL;
8558 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8560 if (fill > maxchar)
8561 maxchar = fill;
8562 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008563 if (!u)
8564 return NULL;
8565
8566 kind = PyUnicode_KIND(u);
8567 data = PyUnicode_DATA(u);
8568 if (left)
8569 FILL(kind, data, fill, 0, left);
8570 if (right)
8571 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008572 if (PyUnicode_CopyCharacters(u, left,
8573 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008574 _PyUnicode_LENGTH(self)) < 0)
8575 {
8576 Py_DECREF(u);
8577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 }
8579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583
Alexander Belopolsky40018472011-02-26 01:02:56 +00008584PyObject *
8585PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
8589 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 switch(PyUnicode_KIND(string)) {
8594 case PyUnicode_1BYTE_KIND:
8595 list = ucs1lib_splitlines(
8596 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8597 PyUnicode_GET_LENGTH(string), keepends);
8598 break;
8599 case PyUnicode_2BYTE_KIND:
8600 list = ucs2lib_splitlines(
8601 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8602 PyUnicode_GET_LENGTH(string), keepends);
8603 break;
8604 case PyUnicode_4BYTE_KIND:
8605 list = ucs4lib_splitlines(
8606 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8607 PyUnicode_GET_LENGTH(string), keepends);
8608 break;
8609 default:
8610 assert(0);
8611 list = 0;
8612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 Py_DECREF(string);
8614 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615}
8616
Alexander Belopolsky40018472011-02-26 01:02:56 +00008617static PyObject *
8618split(PyUnicodeObject *self,
8619 PyUnicodeObject *substring,
8620 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 int kind1, kind2, kind;
8623 void *buf1, *buf2;
8624 Py_ssize_t len1, len2;
8625 PyObject* out;
8626
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008628 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 if (PyUnicode_READY(self) == -1)
8631 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 if (substring == NULL)
8634 switch(PyUnicode_KIND(self)) {
8635 case PyUnicode_1BYTE_KIND:
8636 return ucs1lib_split_whitespace(
8637 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8638 PyUnicode_GET_LENGTH(self), maxcount
8639 );
8640 case PyUnicode_2BYTE_KIND:
8641 return ucs2lib_split_whitespace(
8642 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8643 PyUnicode_GET_LENGTH(self), maxcount
8644 );
8645 case PyUnicode_4BYTE_KIND:
8646 return ucs4lib_split_whitespace(
8647 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8648 PyUnicode_GET_LENGTH(self), maxcount
8649 );
8650 default:
8651 assert(0);
8652 return NULL;
8653 }
8654
8655 if (PyUnicode_READY(substring) == -1)
8656 return NULL;
8657
8658 kind1 = PyUnicode_KIND(self);
8659 kind2 = PyUnicode_KIND(substring);
8660 kind = kind1 > kind2 ? kind1 : kind2;
8661 buf1 = PyUnicode_DATA(self);
8662 buf2 = PyUnicode_DATA(substring);
8663 if (kind1 != kind)
8664 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8665 if (!buf1)
8666 return NULL;
8667 if (kind2 != kind)
8668 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8669 if (!buf2) {
8670 if (kind1 != kind) PyMem_Free(buf1);
8671 return NULL;
8672 }
8673 len1 = PyUnicode_GET_LENGTH(self);
8674 len2 = PyUnicode_GET_LENGTH(substring);
8675
8676 switch(kind) {
8677 case PyUnicode_1BYTE_KIND:
8678 out = ucs1lib_split(
8679 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8680 break;
8681 case PyUnicode_2BYTE_KIND:
8682 out = ucs2lib_split(
8683 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8684 break;
8685 case PyUnicode_4BYTE_KIND:
8686 out = ucs4lib_split(
8687 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8688 break;
8689 default:
8690 out = NULL;
8691 }
8692 if (kind1 != kind)
8693 PyMem_Free(buf1);
8694 if (kind2 != kind)
8695 PyMem_Free(buf2);
8696 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697}
8698
Alexander Belopolsky40018472011-02-26 01:02:56 +00008699static PyObject *
8700rsplit(PyUnicodeObject *self,
8701 PyUnicodeObject *substring,
8702 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008703{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 int kind1, kind2, kind;
8705 void *buf1, *buf2;
8706 Py_ssize_t len1, len2;
8707 PyObject* out;
8708
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008709 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008710 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 if (PyUnicode_READY(self) == -1)
8713 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715 if (substring == NULL)
8716 switch(PyUnicode_KIND(self)) {
8717 case PyUnicode_1BYTE_KIND:
8718 return ucs1lib_rsplit_whitespace(
8719 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8720 PyUnicode_GET_LENGTH(self), maxcount
8721 );
8722 case PyUnicode_2BYTE_KIND:
8723 return ucs2lib_rsplit_whitespace(
8724 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8725 PyUnicode_GET_LENGTH(self), maxcount
8726 );
8727 case PyUnicode_4BYTE_KIND:
8728 return ucs4lib_rsplit_whitespace(
8729 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8730 PyUnicode_GET_LENGTH(self), maxcount
8731 );
8732 default:
8733 assert(0);
8734 return NULL;
8735 }
8736
8737 if (PyUnicode_READY(substring) == -1)
8738 return NULL;
8739
8740 kind1 = PyUnicode_KIND(self);
8741 kind2 = PyUnicode_KIND(substring);
8742 kind = kind1 > kind2 ? kind1 : kind2;
8743 buf1 = PyUnicode_DATA(self);
8744 buf2 = PyUnicode_DATA(substring);
8745 if (kind1 != kind)
8746 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8747 if (!buf1)
8748 return NULL;
8749 if (kind2 != kind)
8750 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8751 if (!buf2) {
8752 if (kind1 != kind) PyMem_Free(buf1);
8753 return NULL;
8754 }
8755 len1 = PyUnicode_GET_LENGTH(self);
8756 len2 = PyUnicode_GET_LENGTH(substring);
8757
8758 switch(kind) {
8759 case PyUnicode_1BYTE_KIND:
8760 out = ucs1lib_rsplit(
8761 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8762 break;
8763 case PyUnicode_2BYTE_KIND:
8764 out = ucs2lib_rsplit(
8765 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8766 break;
8767 case PyUnicode_4BYTE_KIND:
8768 out = ucs4lib_rsplit(
8769 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8770 break;
8771 default:
8772 out = NULL;
8773 }
8774 if (kind1 != kind)
8775 PyMem_Free(buf1);
8776 if (kind2 != kind)
8777 PyMem_Free(buf2);
8778 return out;
8779}
8780
8781static Py_ssize_t
8782anylib_find(int kind, void *buf1, Py_ssize_t len1,
8783 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8784{
8785 switch(kind) {
8786 case PyUnicode_1BYTE_KIND:
8787 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8788 case PyUnicode_2BYTE_KIND:
8789 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8790 case PyUnicode_4BYTE_KIND:
8791 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8792 }
8793 assert(0);
8794 return -1;
8795}
8796
8797static Py_ssize_t
8798anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8799 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8800{
8801 switch(kind) {
8802 case PyUnicode_1BYTE_KIND:
8803 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8804 case PyUnicode_2BYTE_KIND:
8805 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8806 case PyUnicode_4BYTE_KIND:
8807 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8808 }
8809 assert(0);
8810 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008811}
8812
Alexander Belopolsky40018472011-02-26 01:02:56 +00008813static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814replace(PyObject *self, PyObject *str1,
8815 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 PyObject *u;
8818 char *sbuf = PyUnicode_DATA(self);
8819 char *buf1 = PyUnicode_DATA(str1);
8820 char *buf2 = PyUnicode_DATA(str2);
8821 int srelease = 0, release1 = 0, release2 = 0;
8822 int skind = PyUnicode_KIND(self);
8823 int kind1 = PyUnicode_KIND(str1);
8824 int kind2 = PyUnicode_KIND(str2);
8825 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8826 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8827 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828
8829 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008832 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 if (skind < kind1)
8835 /* substring too wide to be present */
8836 goto nothing;
8837
8838 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008839 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008840 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008842 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008843 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008844 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 Py_UCS4 u1, u2, maxchar;
8846 int mayshrink, rkind;
8847 u1 = PyUnicode_READ_CHAR(str1, 0);
8848 if (!findchar(sbuf, PyUnicode_KIND(self),
8849 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008850 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 u2 = PyUnicode_READ_CHAR(str2, 0);
8852 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8853 /* Replacing u1 with u2 may cause a maxchar reduction in the
8854 result string. */
8855 mayshrink = maxchar > 127;
8856 if (u2 > maxchar) {
8857 maxchar = u2;
8858 mayshrink = 0;
8859 }
8860 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008861 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008863 if (PyUnicode_CopyCharacters(u, 0,
8864 (PyObject*)self, 0, slen) < 0)
8865 {
8866 Py_DECREF(u);
8867 return NULL;
8868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008869 rkind = PyUnicode_KIND(u);
8870 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8871 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008872 if (--maxcount < 0)
8873 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008875 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 if (mayshrink) {
8877 PyObject *tmp = u;
8878 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8879 PyUnicode_GET_LENGTH(tmp));
8880 Py_DECREF(tmp);
8881 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 int rkind = skind;
8884 char *res;
8885 if (kind1 < rkind) {
8886 /* widen substring */
8887 buf1 = _PyUnicode_AsKind(str1, rkind);
8888 if (!buf1) goto error;
8889 release1 = 1;
8890 }
8891 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008892 if (i < 0)
8893 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008894 if (rkind > kind2) {
8895 /* widen replacement */
8896 buf2 = _PyUnicode_AsKind(str2, rkind);
8897 if (!buf2) goto error;
8898 release2 = 1;
8899 }
8900 else if (rkind < kind2) {
8901 /* widen self and buf1 */
8902 rkind = kind2;
8903 if (release1) PyMem_Free(buf1);
8904 sbuf = _PyUnicode_AsKind(self, rkind);
8905 if (!sbuf) goto error;
8906 srelease = 1;
8907 buf1 = _PyUnicode_AsKind(str1, rkind);
8908 if (!buf1) goto error;
8909 release1 = 1;
8910 }
8911 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8912 if (!res) {
8913 PyErr_NoMemory();
8914 goto error;
8915 }
8916 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008917 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8919 buf2,
8920 PyUnicode_KIND_SIZE(rkind, len2));
8921 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008922
8923 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8925 slen-i,
8926 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008927 if (i == -1)
8928 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8930 buf2,
8931 PyUnicode_KIND_SIZE(rkind, len2));
8932 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934
8935 u = PyUnicode_FromKindAndData(rkind, res, slen);
8936 PyMem_Free(res);
8937 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 Py_ssize_t n, i, j, ires;
8942 Py_ssize_t product, new_size;
8943 int rkind = skind;
8944 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 if (kind1 < rkind) {
8947 buf1 = _PyUnicode_AsKind(str1, rkind);
8948 if (!buf1) goto error;
8949 release1 = 1;
8950 }
8951 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008952 if (n == 0)
8953 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 if (kind2 < rkind) {
8955 buf2 = _PyUnicode_AsKind(str2, rkind);
8956 if (!buf2) goto error;
8957 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959 else if (kind2 > rkind) {
8960 rkind = kind2;
8961 sbuf = _PyUnicode_AsKind(self, rkind);
8962 if (!sbuf) goto error;
8963 srelease = 1;
8964 if (release1) PyMem_Free(buf1);
8965 buf1 = _PyUnicode_AsKind(str1, rkind);
8966 if (!buf1) goto error;
8967 release1 = 1;
8968 }
8969 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
8970 PyUnicode_GET_LENGTH(str1))); */
8971 product = n * (len2-len1);
8972 if ((product / (len2-len1)) != n) {
8973 PyErr_SetString(PyExc_OverflowError,
8974 "replace string is too long");
8975 goto error;
8976 }
8977 new_size = slen + product;
8978 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
8979 PyErr_SetString(PyExc_OverflowError,
8980 "replace string is too long");
8981 goto error;
8982 }
8983 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
8984 if (!res)
8985 goto error;
8986 ires = i = 0;
8987 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008988 while (n-- > 0) {
8989 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990 j = anylib_find(rkind,
8991 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8992 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008993 if (j == -1)
8994 break;
8995 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008996 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8998 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8999 PyUnicode_KIND_SIZE(rkind, j-i));
9000 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009001 }
9002 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 if (len2 > 0) {
9004 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9005 buf2,
9006 PyUnicode_KIND_SIZE(rkind, len2));
9007 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009012 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9014 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9015 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009016 } else {
9017 /* interleave */
9018 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009019 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9020 buf2,
9021 PyUnicode_KIND_SIZE(rkind, len2));
9022 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009023 if (--n <= 0)
9024 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9026 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9027 PyUnicode_KIND_SIZE(rkind, 1));
9028 ires++;
9029 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9032 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9033 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009037 if (srelease)
9038 PyMem_FREE(sbuf);
9039 if (release1)
9040 PyMem_FREE(buf1);
9041 if (release2)
9042 PyMem_FREE(buf2);
9043 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009044
Benjamin Peterson29060642009-01-31 22:14:21 +00009045 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009046 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047 if (srelease)
9048 PyMem_FREE(sbuf);
9049 if (release1)
9050 PyMem_FREE(buf1);
9051 if (release2)
9052 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009053 if (PyUnicode_CheckExact(self)) {
9054 Py_INCREF(self);
9055 return (PyObject *) self;
9056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 return PyUnicode_FromKindAndData(PyUnicode_KIND(self),
9058 PyUnicode_DATA(self),
9059 PyUnicode_GET_LENGTH(self));
9060 error:
9061 if (srelease && sbuf)
9062 PyMem_FREE(sbuf);
9063 if (release1 && buf1)
9064 PyMem_FREE(buf1);
9065 if (release2 && buf2)
9066 PyMem_FREE(buf2);
9067 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068}
9069
9070/* --- Unicode Object Methods --------------------------------------------- */
9071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009072PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009073 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074\n\
9075Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009076characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077
9078static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009079unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081 return fixup(self, fixtitle);
9082}
9083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009084PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086\n\
9087Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009088have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089
9090static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009091unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093 return fixup(self, fixcapitalize);
9094}
9095
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split self into a list of words, replace every list item
   by its fixcapitalize'd version and join the list again.  Returns NULL
   if splitting or any per-word fixup fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    pos = 0;
    while (pos < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                       fixcapitalize);
        if (result == NULL)
            goto done;              /* result is NULL: reported to caller */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, result);
        pos++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9133
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009134/* Argument converter. Coerces to a single unicode character */
9135
9136static int
9137convert_uc(PyObject *obj, void *addr)
9138{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009140 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009141
Benjamin Peterson14339b62009-01-31 16:36:08 +00009142 uniobj = PyUnicode_FromObject(obj);
9143 if (uniobj == NULL) {
9144 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009145 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009146 return 0;
9147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009149 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009151 Py_DECREF(uniobj);
9152 return 0;
9153 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 if (PyUnicode_READY(uniobj)) {
9155 Py_DECREF(uniobj);
9156 return 0;
9157 }
9158 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009159 Py_DECREF(uniobj);
9160 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009161}
9162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009163PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009166Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009167done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009168
9169static PyObject *
9170unicode_center(PyUnicodeObject *self, PyObject *args)
9171{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009172 Py_ssize_t marg, left;
9173 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 Py_UCS4 fillchar = ' ';
9175
9176 if (PyUnicode_READY(self) == -1)
9177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178
Thomas Woutersde017742006-02-16 19:34:37 +00009179 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180 return NULL;
9181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183 Py_INCREF(self);
9184 return (PyObject*) self;
9185 }
9186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009188 left = marg / 2 + (marg & width & 1);
9189
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009190 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009191}
9192
Marc-André Lemburge5034372000-08-08 08:04:29 +00009193#if 0
9194
9195/* This code should go into some future Unicode collation support
9196 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009197 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009198
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009199/* speedy UTF-16 code point order comparison */
9200/* gleaned from: */
9201/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9202
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009203static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009204{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009205 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009206 0, 0, 0, 0, 0, 0, 0, 0,
9207 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009208 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009209};
9210
Guido van Rossumd57fd912000-03-10 22:53:23 +00009211static int
9212unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9213{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009214 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009215
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216 Py_UNICODE *s1 = str1->str;
9217 Py_UNICODE *s2 = str2->str;
9218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 len1 = str1->_base._base.length;
9220 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009221
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009223 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009224
9225 c1 = *s1++;
9226 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009227
Benjamin Peterson29060642009-01-31 22:14:21 +00009228 if (c1 > (1<<11) * 26)
9229 c1 += utf16Fixup[c1>>11];
9230 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009231 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009232 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009233
9234 if (c1 != c2)
9235 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009236
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009237 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238 }
9239
9240 return (len1 < len2) ? -1 : (len1 != len2);
9241}
9242
Marc-André Lemburge5034372000-08-08 08:04:29 +00009243#else
9244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245/* This function assumes that str1 and str2 are readied by the caller. */
9246
Marc-André Lemburge5034372000-08-08 08:04:29 +00009247static int
9248unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9249{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250 int kind1, kind2;
9251 void *data1, *data2;
9252 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 kind1 = PyUnicode_KIND(str1);
9255 kind2 = PyUnicode_KIND(str2);
9256 data1 = PyUnicode_DATA(str1);
9257 data2 = PyUnicode_DATA(str2);
9258 len1 = PyUnicode_GET_LENGTH(str1);
9259 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 for (i = 0; i < len1 && i < len2; ++i) {
9262 Py_UCS4 c1, c2;
9263 c1 = PyUnicode_READ(kind1, data1, i);
9264 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009265
9266 if (c1 != c2)
9267 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009268 }
9269
9270 return (len1 < len2) ? -1 : (len1 != len2);
9271}
9272
9273#endif
9274
Alexander Belopolsky40018472011-02-26 01:02:56 +00009275int
9276PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009277{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9279 if (PyUnicode_READY(left) == -1 ||
9280 PyUnicode_READY(right) == -1)
9281 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009282 return unicode_compare((PyUnicodeObject *)left,
9283 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009284 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009285 PyErr_Format(PyExc_TypeError,
9286 "Can't compare %.100s and %.100s",
9287 left->ob_type->tp_name,
9288 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289 return -1;
9290}
9291
Martin v. Löwis5b222132007-06-10 09:51:05 +00009292int
9293PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9294{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 Py_ssize_t i;
9296 int kind;
9297 void *data;
9298 Py_UCS4 chr;
9299
Martin v. Löwis5b222132007-06-10 09:51:05 +00009300 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 if (PyUnicode_READY(uni) == -1)
9302 return -1;
9303 kind = PyUnicode_KIND(uni);
9304 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009305 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9307 if (chr != str[i])
9308 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009309 /* This check keeps Python strings that end in '\0' from comparing equal
9310 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009312 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009313 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009314 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009315 return 0;
9316}
9317
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009318
Benjamin Peterson29060642009-01-31 22:14:21 +00009319#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009320 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009321
Alexander Belopolsky40018472011-02-26 01:02:56 +00009322PyObject *
9323PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009324{
9325 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009326
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009327 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9328 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 if (PyUnicode_READY(left) == -1 ||
9330 PyUnicode_READY(right) == -1)
9331 return NULL;
9332 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9333 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009334 if (op == Py_EQ) {
9335 Py_INCREF(Py_False);
9336 return Py_False;
9337 }
9338 if (op == Py_NE) {
9339 Py_INCREF(Py_True);
9340 return Py_True;
9341 }
9342 }
9343 if (left == right)
9344 result = 0;
9345 else
9346 result = unicode_compare((PyUnicodeObject *)left,
9347 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009348
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009349 /* Convert the return value to a Boolean */
9350 switch (op) {
9351 case Py_EQ:
9352 v = TEST_COND(result == 0);
9353 break;
9354 case Py_NE:
9355 v = TEST_COND(result != 0);
9356 break;
9357 case Py_LE:
9358 v = TEST_COND(result <= 0);
9359 break;
9360 case Py_GE:
9361 v = TEST_COND(result >= 0);
9362 break;
9363 case Py_LT:
9364 v = TEST_COND(result == -1);
9365 break;
9366 case Py_GT:
9367 v = TEST_COND(result == 1);
9368 break;
9369 default:
9370 PyErr_BadArgument();
9371 return NULL;
9372 }
9373 Py_INCREF(v);
9374 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009375 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009376
Brian Curtindfc80e32011-08-10 20:28:54 -05009377 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009378}
9379
Alexander Belopolsky40018472011-02-26 01:02:56 +00009380int
9381PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009382{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009383 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 int kind1, kind2, kind;
9385 void *buf1, *buf2;
9386 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009387 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009388
9389 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009390 sub = PyUnicode_FromObject(element);
9391 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009392 PyErr_Format(PyExc_TypeError,
9393 "'in <string>' requires string as left operand, not %s",
9394 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009395 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009396 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009397 if (PyUnicode_READY(sub) == -1)
9398 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009399
Thomas Wouters477c8d52006-05-27 19:21:47 +00009400 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009402 Py_DECREF(sub);
9403 return -1;
9404 }
9405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 kind1 = PyUnicode_KIND(str);
9407 kind2 = PyUnicode_KIND(sub);
9408 kind = kind1 > kind2 ? kind1 : kind2;
9409 buf1 = PyUnicode_DATA(str);
9410 buf2 = PyUnicode_DATA(sub);
9411 if (kind1 != kind)
9412 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9413 if (!buf1) {
9414 Py_DECREF(sub);
9415 return -1;
9416 }
9417 if (kind2 != kind)
9418 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9419 if (!buf2) {
9420 Py_DECREF(sub);
9421 if (kind1 != kind) PyMem_Free(buf1);
9422 return -1;
9423 }
9424 len1 = PyUnicode_GET_LENGTH(str);
9425 len2 = PyUnicode_GET_LENGTH(sub);
9426
9427 switch(kind) {
9428 case PyUnicode_1BYTE_KIND:
9429 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9430 break;
9431 case PyUnicode_2BYTE_KIND:
9432 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9433 break;
9434 case PyUnicode_4BYTE_KIND:
9435 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9436 break;
9437 default:
9438 result = -1;
9439 assert(0);
9440 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009441
9442 Py_DECREF(str);
9443 Py_DECREF(sub);
9444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 if (kind1 != kind)
9446 PyMem_Free(buf1);
9447 if (kind2 != kind)
9448 PyMem_Free(buf2);
9449
Guido van Rossum403d68b2000-03-13 15:55:09 +00009450 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009451}
9452
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453/* Concat to string or Unicode object giving a new Unicode object. */
9454
Alexander Belopolsky40018472011-02-26 01:02:56 +00009455PyObject *
9456PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 PyObject *u = NULL, *v = NULL, *w;
9459 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009460
9461 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009464 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009467 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009468
9469 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009471 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477 }
9478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9480 goto onError;
9481
9482 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009483 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 w = PyUnicode_New(
9487 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9488 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009490 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009491 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9492 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009493 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009494 v, 0,
9495 PyUnicode_GET_LENGTH(v)) < 0)
9496 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497 Py_DECREF(u);
9498 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 Py_XDECREF(u);
9503 Py_XDECREF(v);
9504 return NULL;
9505}
9506
Walter Dörwald1ab83302007-05-18 17:15:44 +00009507void
9508PyUnicode_Append(PyObject **pleft, PyObject *right)
9509{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009510 PyObject *new;
9511 if (*pleft == NULL)
9512 return;
9513 if (right == NULL || !PyUnicode_Check(*pleft)) {
9514 Py_DECREF(*pleft);
9515 *pleft = NULL;
9516 return;
9517 }
9518 new = PyUnicode_Concat(*pleft, right);
9519 Py_DECREF(*pleft);
9520 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009521}
9522
9523void
9524PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9525{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009526 PyUnicode_Append(pleft, right);
9527 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009528}
9529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009530PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009531 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009533Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009534string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009535interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536
9537static PyObject *
9538unicode_count(PyUnicodeObject *self, PyObject *args)
9539{
9540 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009541 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009542 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 int kind1, kind2, kind;
9545 void *buf1, *buf2;
9546 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547
Jesus Ceaac451502011-04-20 17:09:23 +02009548 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9549 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 kind1 = PyUnicode_KIND(self);
9553 kind2 = PyUnicode_KIND(substring);
9554 kind = kind1 > kind2 ? kind1 : kind2;
9555 buf1 = PyUnicode_DATA(self);
9556 buf2 = PyUnicode_DATA(substring);
9557 if (kind1 != kind)
9558 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9559 if (!buf1) {
9560 Py_DECREF(substring);
9561 return NULL;
9562 }
9563 if (kind2 != kind)
9564 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9565 if (!buf2) {
9566 Py_DECREF(substring);
9567 if (kind1 != kind) PyMem_Free(buf1);
9568 return NULL;
9569 }
9570 len1 = PyUnicode_GET_LENGTH(self);
9571 len2 = PyUnicode_GET_LENGTH(substring);
9572
9573 ADJUST_INDICES(start, end, len1);
9574 switch(kind) {
9575 case PyUnicode_1BYTE_KIND:
9576 iresult = ucs1lib_count(
9577 ((Py_UCS1*)buf1) + start, end - start,
9578 buf2, len2, PY_SSIZE_T_MAX
9579 );
9580 break;
9581 case PyUnicode_2BYTE_KIND:
9582 iresult = ucs2lib_count(
9583 ((Py_UCS2*)buf1) + start, end - start,
9584 buf2, len2, PY_SSIZE_T_MAX
9585 );
9586 break;
9587 case PyUnicode_4BYTE_KIND:
9588 iresult = ucs4lib_count(
9589 ((Py_UCS4*)buf1) + start, end - start,
9590 buf2, len2, PY_SSIZE_T_MAX
9591 );
9592 break;
9593 default:
9594 assert(0); iresult = 0;
9595 }
9596
9597 result = PyLong_FromSsize_t(iresult);
9598
9599 if (kind1 != kind)
9600 PyMem_Free(buf1);
9601 if (kind2 != kind)
9602 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603
9604 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009605
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606 return result;
9607}
9608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009609PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009610 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009612Encode S using the codec registered for encoding. Default encoding\n\
9613is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009614handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009615a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9616'xmlcharrefreplace' as well as any other name registered with\n\
9617codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009618
9619static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009620unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009621{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009622 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623 char *encoding = NULL;
9624 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009625
Benjamin Peterson308d6372009-09-18 21:42:35 +00009626 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9627 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009629 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009630}
9631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009632PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009633 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634\n\
9635Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009636If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637
9638static PyObject*
9639unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9640{
9641 Py_UNICODE *e;
9642 Py_UNICODE *p;
9643 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009644 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646 PyUnicodeObject *u;
9647 int tabsize = 8;
9648
9649 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9653 return NULL;
9654
Thomas Wouters7e474022000-07-16 12:04:32 +00009655 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009656 i = 0; /* chars up to and including most recent \n or \r */
9657 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009658 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9659 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009661 if (tabsize > 0) {
9662 incr = tabsize - (j % tabsize); /* cannot overflow */
9663 if (j > PY_SSIZE_T_MAX - incr)
9664 goto overflow1;
9665 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009666 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009669 if (j > PY_SSIZE_T_MAX - 1)
9670 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671 j++;
9672 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009673 if (i > PY_SSIZE_T_MAX - j)
9674 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009676 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677 }
9678 }
9679
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009680 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009681 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009682
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683 /* Second pass: create output string and fill it */
9684 u = _PyUnicode_New(i + j);
9685 if (!u)
9686 return NULL;
9687
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009688 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 q = _PyUnicode_WSTR(u); /* next output char */
9690 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009694 if (tabsize > 0) {
9695 i = tabsize - (j % tabsize);
9696 j += i;
9697 while (i--) {
9698 if (q >= qe)
9699 goto overflow2;
9700 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009701 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009702 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009703 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009704 else {
9705 if (q >= qe)
9706 goto overflow2;
9707 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009708 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709 if (*p == '\n' || *p == '\r')
9710 j = 0;
9711 }
9712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 if (PyUnicode_READY(u) == -1) {
9714 Py_DECREF(u);
9715 return NULL;
9716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009718
9719 overflow2:
9720 Py_DECREF(u);
9721 overflow1:
9722 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724}
9725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009726PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009727 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728\n\
9729Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009730such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731arguments start and end are interpreted as in slice notation.\n\
9732\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009733Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734
9735static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737{
Jesus Ceaac451502011-04-20 17:09:23 +02009738 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009739 Py_ssize_t start;
9740 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009741 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742
Jesus Ceaac451502011-04-20 17:09:23 +02009743 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9744 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747 if (PyUnicode_READY(self) == -1)
9748 return NULL;
9749 if (PyUnicode_READY(substring) == -1)
9750 return NULL;
9751
9752 result = any_find_slice(
9753 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9754 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009755 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756
9757 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009759 if (result == -2)
9760 return NULL;
9761
Christian Heimes217cfd12007-12-02 14:31:20 +00009762 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009763}
9764
9765static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009766unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009767{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 Py_UCS4 ch;
9769
9770 if (PyUnicode_READY(self) == -1)
9771 return NULL;
9772 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773 PyErr_SetString(PyExc_IndexError, "string index out of range");
9774 return NULL;
9775 }
9776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9778 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779}
9780
/* Believe it or not, this produces the same value for ASCII strings
   as bytes_hash(). */
/* Compute the hash of a string and cache it in the object.

   A cached value other than -1 is returned immediately; -1 is the
   "not computed yet" marker in the cache slot, which is why a computed
   value of -1 is replaced by -2 before being stored.  Returns -1 if
   the string cannot be made ready. */
static Py_hash_t
unicode_hash(PyUnicodeObject *self)
{
    Py_ssize_t len;
    Py_uhash_t x;

    /* Fast path: hash already computed and cached. */
    if (_PyUnicode_HASH(self) != -1)
        return _PyUnicode_HASH(self);
    if (PyUnicode_READY(self) == -1)
        return -1;
    len = PyUnicode_GET_LENGTH(self);

    /* The hash function as a macro, gets expanded three times below. */
    /* HASH seeds x from the first element shifted left by 7, then folds
       every element in with a multiply-by-1000003 and XOR step.  It
       advances P and counts len down to -1, so neither can be reused
       afterwards. */
#define HASH(P) \
    x = (Py_uhash_t)*P << 7; \
    while (--len >= 0) \
        x = (1000003*x) ^ (Py_uhash_t)*P++;

    /* One expansion per storage width, so each loop reads elements of
       the right size. */
    switch (PyUnicode_KIND(self)) {
    case PyUnicode_1BYTE_KIND: {
        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
        HASH(c);
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
        HASH(s);
        break;
    }
    default: {
        Py_UCS4 *l;
        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
               "Impossible switch case in unicode_hash");
        l = PyUnicode_4BYTE_DATA(self);
        HASH(l);
        break;
    }
    }
    /* len was consumed by HASH, so the length is read again here. */
    x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);

    /* x is unsigned: the comparison matches the all-ones bit pattern. */
    if (x == -1)
        x = -2;
    _PyUnicode_HASH(self) = x;
    return x;
}
#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009830PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009831 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009833Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834
9835static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009837{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009838 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009839 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009840 Py_ssize_t start;
9841 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009842
Jesus Ceaac451502011-04-20 17:09:23 +02009843 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9844 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 if (PyUnicode_READY(self) == -1)
9848 return NULL;
9849 if (PyUnicode_READY(substring) == -1)
9850 return NULL;
9851
9852 result = any_find_slice(
9853 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9854 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009855 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856
9857 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 if (result == -2)
9860 return NULL;
9861
Guido van Rossumd57fd912000-03-10 22:53:23 +00009862 if (result < 0) {
9863 PyErr_SetString(PyExc_ValueError, "substring not found");
9864 return NULL;
9865 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009866
Christian Heimes217cfd12007-12-02 14:31:20 +00009867 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868}
9869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009870PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009871 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009873Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009874at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875
9876static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009877unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009878{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009879 Py_ssize_t i, length;
9880 int kind;
9881 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882 int cased;
9883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 if (PyUnicode_READY(self) == -1)
9885 return NULL;
9886 length = PyUnicode_GET_LENGTH(self);
9887 kind = PyUnicode_KIND(self);
9888 data = PyUnicode_DATA(self);
9889
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 if (length == 1)
9892 return PyBool_FromLong(
9893 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009895 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009897 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009898
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 for (i = 0; i < length; i++) {
9901 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009902
Benjamin Peterson29060642009-01-31 22:14:21 +00009903 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9904 return PyBool_FromLong(0);
9905 else if (!cased && Py_UNICODE_ISLOWER(ch))
9906 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009908 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909}
9910
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009911PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009912 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009914Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009915at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916
9917static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009918unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 Py_ssize_t i, length;
9921 int kind;
9922 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923 int cased;
9924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 if (PyUnicode_READY(self) == -1)
9926 return NULL;
9927 length = PyUnicode_GET_LENGTH(self);
9928 kind = PyUnicode_KIND(self);
9929 data = PyUnicode_DATA(self);
9930
Guido van Rossumd57fd912000-03-10 22:53:23 +00009931 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 if (length == 1)
9933 return PyBool_FromLong(
9934 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009936 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009938 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009939
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 for (i = 0; i < length; i++) {
9942 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009943
Benjamin Peterson29060642009-01-31 22:14:21 +00009944 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9945 return PyBool_FromLong(0);
9946 else if (!cased && Py_UNICODE_ISUPPER(ch))
9947 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009949 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950}
9951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009952PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009953 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009954\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009955Return True if S is a titlecased string and there is at least one\n\
9956character in S, i.e. upper- and titlecase characters may only\n\
9957follow uncased characters and lowercase characters only cased ones.\n\
9958Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959
9960static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009961unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009962{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 Py_ssize_t i, length;
9964 int kind;
9965 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966 int cased, previous_is_cased;
9967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 if (PyUnicode_READY(self) == -1)
9969 return NULL;
9970 length = PyUnicode_GET_LENGTH(self);
9971 kind = PyUnicode_KIND(self);
9972 data = PyUnicode_DATA(self);
9973
Guido van Rossumd57fd912000-03-10 22:53:23 +00009974 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 if (length == 1) {
9976 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
9977 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
9978 (Py_UNICODE_ISUPPER(ch) != 0));
9979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009981 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009984
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985 cased = 0;
9986 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 for (i = 0; i < length; i++) {
9988 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009989
Benjamin Peterson29060642009-01-31 22:14:21 +00009990 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
9991 if (previous_is_cased)
9992 return PyBool_FromLong(0);
9993 previous_is_cased = 1;
9994 cased = 1;
9995 }
9996 else if (Py_UNICODE_ISLOWER(ch)) {
9997 if (!previous_is_cased)
9998 return PyBool_FromLong(0);
9999 previous_is_cased = 1;
10000 cased = 1;
10001 }
10002 else
10003 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010005 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006}
10007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010008PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010009 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010011Return True if all characters in S are whitespace\n\
10012and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010013
10014static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010015unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 Py_ssize_t i, length;
10018 int kind;
10019 void *data;
10020
10021 if (PyUnicode_READY(self) == -1)
10022 return NULL;
10023 length = PyUnicode_GET_LENGTH(self);
10024 kind = PyUnicode_KIND(self);
10025 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 if (length == 1)
10029 return PyBool_FromLong(
10030 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010032 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010034 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 for (i = 0; i < length; i++) {
10037 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010038 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010039 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010041 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010042}
10043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010044PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010045 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010046\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010047Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010048and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010049
10050static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010051unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010052{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 Py_ssize_t i, length;
10054 int kind;
10055 void *data;
10056
10057 if (PyUnicode_READY(self) == -1)
10058 return NULL;
10059 length = PyUnicode_GET_LENGTH(self);
10060 kind = PyUnicode_KIND(self);
10061 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010062
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010063 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 if (length == 1)
10065 return PyBool_FromLong(
10066 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010067
10068 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010070 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 for (i = 0; i < length; i++) {
10073 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010074 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010075 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010076 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010077}
10078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010079PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010080 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010081\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010082Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010083and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010084
10085static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010086unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 int kind;
10089 void *data;
10090 Py_ssize_t len, i;
10091
10092 if (PyUnicode_READY(self) == -1)
10093 return NULL;
10094
10095 kind = PyUnicode_KIND(self);
10096 data = PyUnicode_DATA(self);
10097 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010098
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010099 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 if (len == 1) {
10101 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10102 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10103 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010104
10105 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010107 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 for (i = 0; i < len; i++) {
10110 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010111 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010112 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010113 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010114 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010115}
10116
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010117PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010118 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010119\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010120Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010121False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010122
10123static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010124unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 Py_ssize_t i, length;
10127 int kind;
10128 void *data;
10129
10130 if (PyUnicode_READY(self) == -1)
10131 return NULL;
10132 length = PyUnicode_GET_LENGTH(self);
10133 kind = PyUnicode_KIND(self);
10134 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 if (length == 1)
10138 return PyBool_FromLong(
10139 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010141 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010143 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 for (i = 0; i < length; i++) {
10146 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010147 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010149 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150}
10151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010152PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010153 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010155Return True if all characters in S are digits\n\
10156and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157
10158static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010159unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 Py_ssize_t i, length;
10162 int kind;
10163 void *data;
10164
10165 if (PyUnicode_READY(self) == -1)
10166 return NULL;
10167 length = PyUnicode_GET_LENGTH(self);
10168 kind = PyUnicode_KIND(self);
10169 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 if (length == 1) {
10173 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10174 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010177 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010179 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 for (i = 0; i < length; i++) {
10182 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010183 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010185 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186}
10187
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010188PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010189 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010190\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010191Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010192False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193
10194static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010195unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 Py_ssize_t i, length;
10198 int kind;
10199 void *data;
10200
10201 if (PyUnicode_READY(self) == -1)
10202 return NULL;
10203 length = PyUnicode_GET_LENGTH(self);
10204 kind = PyUnicode_KIND(self);
10205 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 if (length == 1)
10209 return PyBool_FromLong(
10210 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010212 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010214 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 for (i = 0; i < length; i++) {
10217 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010218 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010220 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221}
10222
Martin v. Löwis47383402007-08-15 07:32:56 +000010223int
10224PyUnicode_IsIdentifier(PyObject *self)
10225{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 int kind;
10227 void *data;
10228 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010229 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 if (PyUnicode_READY(self) == -1) {
10232 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010233 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 }
10235
10236 /* Special case for empty strings */
10237 if (PyUnicode_GET_LENGTH(self) == 0)
10238 return 0;
10239 kind = PyUnicode_KIND(self);
10240 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010241
10242 /* PEP 3131 says that the first character must be in
10243 XID_Start and subsequent characters in XID_Continue,
10244 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010245 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010246 letters, digits, underscore). However, given the current
10247 definition of XID_Start and XID_Continue, it is sufficient
10248 to check just for these, except that _ must be allowed
10249 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010251 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010252 return 0;
10253
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010254 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010256 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010257 return 1;
10258}
10259
10260PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010261 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010262\n\
10263Return True if S is a valid identifier according\n\
10264to the language definition.");
10265
10266static PyObject*
10267unicode_isidentifier(PyObject *self)
10268{
10269 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10270}
10271
Georg Brandl559e5d72008-06-11 18:37:52 +000010272PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010273 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010274\n\
10275Return True if all characters in S are considered\n\
10276printable in repr() or S is empty, False otherwise.");
10277
10278static PyObject*
10279unicode_isprintable(PyObject *self)
10280{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 Py_ssize_t i, length;
10282 int kind;
10283 void *data;
10284
10285 if (PyUnicode_READY(self) == -1)
10286 return NULL;
10287 length = PyUnicode_GET_LENGTH(self);
10288 kind = PyUnicode_KIND(self);
10289 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010290
10291 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 if (length == 1)
10293 return PyBool_FromLong(
10294 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 for (i = 0; i < length; i++) {
10297 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010298 Py_RETURN_FALSE;
10299 }
10300 }
10301 Py_RETURN_TRUE;
10302}
10303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010304PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010305 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306\n\
10307Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010308iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010309
10310static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010311unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010313 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314}
10315
Martin v. Löwis18e16552006-02-15 17:27:45 +000010316static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317unicode_length(PyUnicodeObject *self)
10318{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 if (PyUnicode_READY(self) == -1)
10320 return -1;
10321 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322}
10323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010324PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010325 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010327Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010328done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329
10330static PyObject *
10331unicode_ljust(PyUnicodeObject *self, PyObject *args)
10332{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010333 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 Py_UCS4 fillchar = ' ';
10335
10336 if (PyUnicode_READY(self) == -1)
10337 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010338
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010339 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340 return NULL;
10341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343 Py_INCREF(self);
10344 return (PyObject*) self;
10345 }
10346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348}
10349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010350PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010351 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010353Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354
10355static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010356unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358 return fixup(self, fixlower);
10359}
10360
/* Selectors for the strip family.  They double as indices into
   stripformat[] below, so the values must stay 0, 1 and 2. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
10369
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010370/* externally visible for str.strip(unicode) */
10371PyObject *
10372_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10373{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 void *data;
10375 int kind;
10376 Py_ssize_t i, j, len;
10377 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10380 return NULL;
10381
10382 kind = PyUnicode_KIND(self);
10383 data = PyUnicode_DATA(self);
10384 len = PyUnicode_GET_LENGTH(self);
10385 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10386 PyUnicode_DATA(sepobj),
10387 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010388
Benjamin Peterson14339b62009-01-31 16:36:08 +000010389 i = 0;
10390 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 while (i < len &&
10392 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010393 i++;
10394 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010395 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010396
Benjamin Peterson14339b62009-01-31 16:36:08 +000010397 j = len;
10398 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010399 do {
10400 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 } while (j >= i &&
10402 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010403 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010404 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010405
Benjamin Peterson14339b62009-01-31 16:36:08 +000010406 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010407 Py_INCREF(self);
10408 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010409 }
10410 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010412}
10413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414/* Assumes an already ready self string. */
10415
10416static PyObject *
10417substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10418{
10419 const int kind = PyUnicode_KIND(self);
10420 void *data = PyUnicode_DATA(self);
10421 Py_UCS4 maxchar = 0;
10422 Py_ssize_t i;
10423 PyObject *unicode;
10424
10425 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10426 PyErr_BadInternalCall();
10427 return NULL;
10428 }
10429
10430 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10431 Py_INCREF(self);
10432 return (PyObject*)self;
10433 }
10434
10435 for (i = 0; i < len; ++i) {
10436 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10437 if (ch > maxchar)
10438 maxchar = ch;
10439 }
10440
10441 unicode = PyUnicode_New(len, maxchar);
10442 if (unicode == NULL)
10443 return NULL;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010444 if (PyUnicode_CopyCharacters(unicode, 0,
10445 (PyObject*)self, start, len) < 0)
10446 {
10447 Py_DECREF(unicode);
10448 return NULL;
10449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 return unicode;
10451}
10452
10453PyObject*
10454PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10455{
10456 unsigned char *data;
10457 int kind;
10458
10459 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10460 && PyUnicode_CheckExact(self))
10461 {
10462 Py_INCREF(self);
10463 return (PyObject *)self;
10464 }
10465
10466 if ((end - start) == 1)
10467 return unicode_getitem((PyUnicodeObject*)self, start);
10468
10469 if (PyUnicode_READY(self) == -1)
10470 return NULL;
10471 kind = PyUnicode_KIND(self);
10472 data = PyUnicode_1BYTE_DATA(self);
10473 return PyUnicode_FromKindAndData(kind, data + PyUnicode_KIND_SIZE(kind, start),
10474 end-start);
10475}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010476
10477static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010478do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 int kind;
10481 void *data;
10482 Py_ssize_t len, i, j;
10483
10484 if (PyUnicode_READY(self) == -1)
10485 return NULL;
10486
10487 kind = PyUnicode_KIND(self);
10488 data = PyUnicode_DATA(self);
10489 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010490
Benjamin Peterson14339b62009-01-31 16:36:08 +000010491 i = 0;
10492 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010494 i++;
10495 }
10496 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010497
Benjamin Peterson14339b62009-01-31 16:36:08 +000010498 j = len;
10499 if (striptype != LEFTSTRIP) {
10500 do {
10501 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010503 j++;
10504 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010505
Benjamin Peterson14339b62009-01-31 16:36:08 +000010506 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10507 Py_INCREF(self);
10508 return (PyObject*)self;
10509 }
10510 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010512}
10513
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010514
10515static PyObject *
10516do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10517{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010518 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010519
Benjamin Peterson14339b62009-01-31 16:36:08 +000010520 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10521 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010522
Benjamin Peterson14339b62009-01-31 16:36:08 +000010523 if (sep != NULL && sep != Py_None) {
10524 if (PyUnicode_Check(sep))
10525 return _PyUnicode_XStrip(self, striptype, sep);
10526 else {
10527 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010528 "%s arg must be None or str",
10529 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010530 return NULL;
10531 }
10532 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010533
Benjamin Peterson14339b62009-01-31 16:36:08 +000010534 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010535}
10536
10537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010538PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010539 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010540\n\
10541Return a copy of the string S with leading and trailing\n\
10542whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010543If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010544
10545static PyObject *
10546unicode_strip(PyUnicodeObject *self, PyObject *args)
10547{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010548 if (PyTuple_GET_SIZE(args) == 0)
10549 return do_strip(self, BOTHSTRIP); /* Common case */
10550 else
10551 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010552}
10553
10554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010555PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010556 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010557\n\
10558Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010559If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010560
10561static PyObject *
10562unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10563{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010564 if (PyTuple_GET_SIZE(args) == 0)
10565 return do_strip(self, LEFTSTRIP); /* Common case */
10566 else
10567 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010568}
10569
10570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010571PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010572 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010573\n\
10574Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010575If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010576
10577static PyObject *
10578unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10579{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010580 if (PyTuple_GET_SIZE(args) == 0)
10581 return do_strip(self, RIGHTSTRIP); /* Common case */
10582 else
10583 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010584}
10585
10586
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010588unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589{
10590 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 Py_ssize_t nchars, n;
10592 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593
Georg Brandl222de0f2009-04-12 12:01:50 +000010594 if (len < 1) {
10595 Py_INCREF(unicode_empty);
10596 return (PyObject *)unicode_empty;
10597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598
Tim Peters7a29bd52001-09-12 03:03:31 +000010599 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600 /* no repeat, return original string */
10601 Py_INCREF(str);
10602 return (PyObject*) str;
10603 }
Tim Peters8f422462000-09-09 06:13:41 +000010604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (PyUnicode_READY(str) == -1)
10606 return NULL;
10607
Tim Peters8f422462000-09-09 06:13:41 +000010608 /* ensure # of chars needed doesn't overflow int and # of bytes
10609 * needed doesn't overflow size_t
10610 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 nchars = len * PyUnicode_GET_LENGTH(str);
10612 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010613 PyErr_SetString(PyExc_OverflowError,
10614 "repeated string is too long");
10615 return NULL;
10616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 char_size = PyUnicode_CHARACTER_SIZE(str);
10618 nbytes = (nchars + 1) * char_size;
10619 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010620 PyErr_SetString(PyExc_OverflowError,
10621 "repeated string is too long");
10622 return NULL;
10623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625 if (!u)
10626 return NULL;
10627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 if (PyUnicode_GET_LENGTH(str) == 1) {
10629 const int kind = PyUnicode_KIND(str);
10630 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10631 void *to = PyUnicode_DATA(u);
10632 for (n = 0; n < len; ++n)
10633 PyUnicode_WRITE(kind, to, n, fill_char);
10634 }
10635 else {
10636 /* number of characters copied this far */
10637 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10638 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10639 char *to = (char *) PyUnicode_DATA(u);
10640 Py_MEMCPY(to, PyUnicode_DATA(str),
10641 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010642 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 n = (done <= nchars-done) ? done : nchars-done;
10644 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010645 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647 }
10648
10649 return (PyObject*) u;
10650}
10651
Alexander Belopolsky40018472011-02-26 01:02:56 +000010652PyObject *
10653PyUnicode_Replace(PyObject *obj,
10654 PyObject *subobj,
10655 PyObject *replobj,
10656 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657{
10658 PyObject *self;
10659 PyObject *str1;
10660 PyObject *str2;
10661 PyObject *result;
10662
10663 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010665 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010668 Py_DECREF(self);
10669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670 }
10671 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010673 Py_DECREF(self);
10674 Py_DECREF(str1);
10675 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678 Py_DECREF(self);
10679 Py_DECREF(str1);
10680 Py_DECREF(str2);
10681 return result;
10682}
10683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010684PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010685 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686\n\
10687Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010688old replaced by new. If the optional argument count is\n\
10689given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690
10691static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 PyObject *str1;
10695 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010696 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697 PyObject *result;
10698
Martin v. Löwis18e16552006-02-15 17:27:45 +000010699 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010702 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 str1 = PyUnicode_FromObject(str1);
10704 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10705 return NULL;
10706 str2 = PyUnicode_FromObject(str2);
10707 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010708 Py_DECREF(str1);
10709 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711
10712 result = replace(self, str1, str2, maxcount);
10713
10714 Py_DECREF(str1);
10715 Py_DECREF(str2);
10716 return result;
10717}
10718
Alexander Belopolsky40018472011-02-26 01:02:56 +000010719static PyObject *
10720unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010722 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 Py_ssize_t isize;
10724 Py_ssize_t osize, squote, dquote, i, o;
10725 Py_UCS4 max, quote;
10726 int ikind, okind;
10727 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010730 return NULL;
10731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010732 isize = PyUnicode_GET_LENGTH(unicode);
10733 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 /* Compute length of output, quote characters, and
10736 maximum character */
10737 osize = 2; /* quotes */
10738 max = 127;
10739 squote = dquote = 0;
10740 ikind = PyUnicode_KIND(unicode);
10741 for (i = 0; i < isize; i++) {
10742 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10743 switch (ch) {
10744 case '\'': squote++; osize++; break;
10745 case '"': dquote++; osize++; break;
10746 case '\\': case '\t': case '\r': case '\n':
10747 osize += 2; break;
10748 default:
10749 /* Fast-path ASCII */
10750 if (ch < ' ' || ch == 0x7f)
10751 osize += 4; /* \xHH */
10752 else if (ch < 0x7f)
10753 osize++;
10754 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10755 osize++;
10756 max = ch > max ? ch : max;
10757 }
10758 else if (ch < 0x100)
10759 osize += 4; /* \xHH */
10760 else if (ch < 0x10000)
10761 osize += 6; /* \uHHHH */
10762 else
10763 osize += 10; /* \uHHHHHHHH */
10764 }
10765 }
10766
10767 quote = '\'';
10768 if (squote) {
10769 if (dquote)
10770 /* Both squote and dquote present. Use squote,
10771 and escape them */
10772 osize += squote;
10773 else
10774 quote = '"';
10775 }
10776
10777 repr = PyUnicode_New(osize, max);
10778 if (repr == NULL)
10779 return NULL;
10780 okind = PyUnicode_KIND(repr);
10781 odata = PyUnicode_DATA(repr);
10782
10783 PyUnicode_WRITE(okind, odata, 0, quote);
10784 PyUnicode_WRITE(okind, odata, osize-1, quote);
10785
10786 for (i = 0, o = 1; i < isize; i++) {
10787 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010788
10789 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 if ((ch == quote) || (ch == '\\')) {
10791 PyUnicode_WRITE(okind, odata, o++, '\\');
10792 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010793 continue;
10794 }
10795
Benjamin Peterson29060642009-01-31 22:14:21 +000010796 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010797 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 PyUnicode_WRITE(okind, odata, o++, '\\');
10799 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010800 }
10801 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 PyUnicode_WRITE(okind, odata, o++, '\\');
10803 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010804 }
10805 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 PyUnicode_WRITE(okind, odata, o++, '\\');
10807 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010808 }
10809
10810 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010811 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 PyUnicode_WRITE(okind, odata, o++, '\\');
10813 PyUnicode_WRITE(okind, odata, o++, 'x');
10814 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10815 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010816 }
10817
Georg Brandl559e5d72008-06-11 18:37:52 +000010818 /* Copy ASCII characters as-is */
10819 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010821 }
10822
Benjamin Peterson29060642009-01-31 22:14:21 +000010823 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010824 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010825 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010826 (categories Z* and C* except ASCII space)
10827 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010829 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 if (ch <= 0xff) {
10831 PyUnicode_WRITE(okind, odata, o++, '\\');
10832 PyUnicode_WRITE(okind, odata, o++, 'x');
10833 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10834 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010835 }
10836 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837 else if (ch >= 0x10000) {
10838 PyUnicode_WRITE(okind, odata, o++, '\\');
10839 PyUnicode_WRITE(okind, odata, o++, 'U');
10840 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10841 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10842 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10843 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10844 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10845 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10846 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10847 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010848 }
10849 /* Map 16-bit characters to '\uxxxx' */
10850 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 PyUnicode_WRITE(okind, odata, o++, '\\');
10852 PyUnicode_WRITE(okind, odata, o++, 'u');
10853 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10854 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10855 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10856 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010857 }
10858 }
10859 /* Copy characters as-is */
10860 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010862 }
10863 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010866 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867}
10868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010869PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010870 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871\n\
10872Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010873such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874arguments start and end are interpreted as in slice notation.\n\
10875\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010876Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877
10878static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880{
Jesus Ceaac451502011-04-20 17:09:23 +020010881 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010882 Py_ssize_t start;
10883 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010884 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885
Jesus Ceaac451502011-04-20 17:09:23 +020010886 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10887 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010888 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010890 if (PyUnicode_READY(self) == -1)
10891 return NULL;
10892 if (PyUnicode_READY(substring) == -1)
10893 return NULL;
10894
10895 result = any_find_slice(
10896 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10897 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010898 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010899
10900 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902 if (result == -2)
10903 return NULL;
10904
Christian Heimes217cfd12007-12-02 14:31:20 +000010905 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906}
10907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010908PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010909 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010911Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912
10913static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915{
Jesus Ceaac451502011-04-20 17:09:23 +020010916 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010917 Py_ssize_t start;
10918 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010919 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920
Jesus Ceaac451502011-04-20 17:09:23 +020010921 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10922 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 if (PyUnicode_READY(self) == -1)
10926 return NULL;
10927 if (PyUnicode_READY(substring) == -1)
10928 return NULL;
10929
10930 result = any_find_slice(
10931 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10932 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010933 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934
10935 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 if (result == -2)
10938 return NULL;
10939
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940 if (result < 0) {
10941 PyErr_SetString(PyExc_ValueError, "substring not found");
10942 return NULL;
10943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944
Christian Heimes217cfd12007-12-02 14:31:20 +000010945 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946}
10947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010948PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010949 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010951Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010952done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953
10954static PyObject *
10955unicode_rjust(PyUnicodeObject *self, PyObject *args)
10956{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010957 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 Py_UCS4 fillchar = ' ';
10959
10960 if (PyUnicode_READY(self) == -1)
10961 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010962
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010963 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964 return NULL;
10965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967 Py_INCREF(self);
10968 return (PyObject*) self;
10969 }
10970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972}
10973
Alexander Belopolsky40018472011-02-26 01:02:56 +000010974PyObject *
10975PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976{
10977 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010978
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979 s = PyUnicode_FromObject(s);
10980 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010981 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010982 if (sep != NULL) {
10983 sep = PyUnicode_FromObject(sep);
10984 if (sep == NULL) {
10985 Py_DECREF(s);
10986 return NULL;
10987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988 }
10989
10990 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10991
10992 Py_DECREF(s);
10993 Py_XDECREF(sep);
10994 return result;
10995}
10996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010997PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010998 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999\n\
11000Return a list of the words in S, using sep as the\n\
11001delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011002splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011003whitespace string is a separator and empty strings are\n\
11004removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005
11006static PyObject*
11007unicode_split(PyUnicodeObject *self, PyObject *args)
11008{
11009 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011010 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011
Martin v. Löwis18e16552006-02-15 17:27:45 +000011012 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 return NULL;
11014
11015 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021}
11022
Thomas Wouters477c8d52006-05-27 19:21:47 +000011023PyObject *
11024PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11025{
11026 PyObject* str_obj;
11027 PyObject* sep_obj;
11028 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 int kind1, kind2, kind;
11030 void *buf1 = NULL, *buf2 = NULL;
11031 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011032
11033 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011035 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011036 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011038 Py_DECREF(str_obj);
11039 return NULL;
11040 }
11041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 kind1 = PyUnicode_KIND(str_in);
11043 kind2 = PyUnicode_KIND(sep_obj);
11044 kind = kind1 > kind2 ? kind1 : kind2;
11045 buf1 = PyUnicode_DATA(str_in);
11046 if (kind1 != kind)
11047 buf1 = _PyUnicode_AsKind(str_in, kind);
11048 if (!buf1)
11049 goto onError;
11050 buf2 = PyUnicode_DATA(sep_obj);
11051 if (kind2 != kind)
11052 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11053 if (!buf2)
11054 goto onError;
11055 len1 = PyUnicode_GET_LENGTH(str_obj);
11056 len2 = PyUnicode_GET_LENGTH(sep_obj);
11057
11058 switch(PyUnicode_KIND(str_in)) {
11059 case PyUnicode_1BYTE_KIND:
11060 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11061 break;
11062 case PyUnicode_2BYTE_KIND:
11063 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11064 break;
11065 case PyUnicode_4BYTE_KIND:
11066 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11067 break;
11068 default:
11069 assert(0);
11070 out = 0;
11071 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011072
11073 Py_DECREF(sep_obj);
11074 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011075 if (kind1 != kind)
11076 PyMem_Free(buf1);
11077 if (kind2 != kind)
11078 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011079
11080 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 onError:
11082 Py_DECREF(sep_obj);
11083 Py_DECREF(str_obj);
11084 if (kind1 != kind && buf1)
11085 PyMem_Free(buf1);
11086 if (kind2 != kind && buf2)
11087 PyMem_Free(buf2);
11088 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011089}
11090
11091
11092PyObject *
11093PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11094{
11095 PyObject* str_obj;
11096 PyObject* sep_obj;
11097 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 int kind1, kind2, kind;
11099 void *buf1 = NULL, *buf2 = NULL;
11100 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011101
11102 str_obj = PyUnicode_FromObject(str_in);
11103 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011104 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011105 sep_obj = PyUnicode_FromObject(sep_in);
11106 if (!sep_obj) {
11107 Py_DECREF(str_obj);
11108 return NULL;
11109 }
11110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 kind1 = PyUnicode_KIND(str_in);
11112 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011113 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011114 buf1 = PyUnicode_DATA(str_in);
11115 if (kind1 != kind)
11116 buf1 = _PyUnicode_AsKind(str_in, kind);
11117 if (!buf1)
11118 goto onError;
11119 buf2 = PyUnicode_DATA(sep_obj);
11120 if (kind2 != kind)
11121 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11122 if (!buf2)
11123 goto onError;
11124 len1 = PyUnicode_GET_LENGTH(str_obj);
11125 len2 = PyUnicode_GET_LENGTH(sep_obj);
11126
11127 switch(PyUnicode_KIND(str_in)) {
11128 case PyUnicode_1BYTE_KIND:
11129 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11130 break;
11131 case PyUnicode_2BYTE_KIND:
11132 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11133 break;
11134 case PyUnicode_4BYTE_KIND:
11135 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11136 break;
11137 default:
11138 assert(0);
11139 out = 0;
11140 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011141
11142 Py_DECREF(sep_obj);
11143 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 if (kind1 != kind)
11145 PyMem_Free(buf1);
11146 if (kind2 != kind)
11147 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011148
11149 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 onError:
11151 Py_DECREF(sep_obj);
11152 Py_DECREF(str_obj);
11153 if (kind1 != kind && buf1)
11154 PyMem_Free(buf1);
11155 if (kind2 != kind && buf2)
11156 PyMem_Free(buf2);
11157 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011158}
11159
11160PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011161 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011162\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011163Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011164the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011165found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166
11167static PyObject*
11168unicode_partition(PyUnicodeObject *self, PyObject *separator)
11169{
11170 return PyUnicode_Partition((PyObject *)self, separator);
11171}
11172
11173PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011174 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011175\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011176Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011177the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011178separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011179
11180static PyObject*
11181unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11182{
11183 return PyUnicode_RPartition((PyObject *)self, separator);
11184}
11185
Alexander Belopolsky40018472011-02-26 01:02:56 +000011186PyObject *
11187PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011188{
11189 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011190
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011191 s = PyUnicode_FromObject(s);
11192 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011193 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011194 if (sep != NULL) {
11195 sep = PyUnicode_FromObject(sep);
11196 if (sep == NULL) {
11197 Py_DECREF(s);
11198 return NULL;
11199 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011200 }
11201
11202 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11203
11204 Py_DECREF(s);
11205 Py_XDECREF(sep);
11206 return result;
11207}
11208
11209PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011210 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011211\n\
11212Return a list of the words in S, using sep as the\n\
11213delimiter string, starting at the end of the string and\n\
11214working to the front. If maxsplit is given, at most maxsplit\n\
11215splits are done. If sep is not specified, any whitespace string\n\
11216is a separator.");
11217
11218static PyObject*
11219unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11220{
11221 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011222 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011223
Martin v. Löwis18e16552006-02-15 17:27:45 +000011224 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011225 return NULL;
11226
11227 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011228 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011229 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011230 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011231 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011232 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011233}
11234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011235PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237\n\
11238Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011239Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011240is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241
11242static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011243unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011245 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011246 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011248 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11249 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250 return NULL;
11251
Guido van Rossum86662912000-04-11 15:38:46 +000011252 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253}
11254
11255static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011256PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257{
Walter Dörwald346737f2007-05-31 10:44:43 +000011258 if (PyUnicode_CheckExact(self)) {
11259 Py_INCREF(self);
11260 return self;
11261 } else
11262 /* Subtype -- return genuine unicode string with the same value. */
11263 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
11264 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265}
11266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011267PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269\n\
11270Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011271and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272
11273static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011274unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 return fixup(self, fixswapcase);
11277}
11278
Georg Brandlceee0772007-11-27 23:48:05 +000011279PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011280 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011281\n\
11282Return a translation table usable for str.translate().\n\
11283If there is only one argument, it must be a dictionary mapping Unicode\n\
11284ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011285Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011286If there are two arguments, they must be strings of equal length, and\n\
11287in the resulting dictionary, each character in x will be mapped to the\n\
11288character at the same position in y. If there is a third argument, it\n\
11289must be a string, whose characters will be mapped to None in the result.");
11290
11291static PyObject*
11292unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11293{
11294 PyObject *x, *y = NULL, *z = NULL;
11295 PyObject *new = NULL, *key, *value;
11296 Py_ssize_t i = 0;
11297 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011298
Georg Brandlceee0772007-11-27 23:48:05 +000011299 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11300 return NULL;
11301 new = PyDict_New();
11302 if (!new)
11303 return NULL;
11304 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305 int x_kind, y_kind, z_kind;
11306 void *x_data, *y_data, *z_data;
11307
Georg Brandlceee0772007-11-27 23:48:05 +000011308 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011309 if (!PyUnicode_Check(x)) {
11310 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11311 "be a string if there is a second argument");
11312 goto err;
11313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011315 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11316 "arguments must have equal length");
11317 goto err;
11318 }
11319 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 x_kind = PyUnicode_KIND(x);
11321 y_kind = PyUnicode_KIND(y);
11322 x_data = PyUnicode_DATA(x);
11323 y_data = PyUnicode_DATA(y);
11324 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11325 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11326 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011327 if (!key || !value)
11328 goto err;
11329 res = PyDict_SetItem(new, key, value);
11330 Py_DECREF(key);
11331 Py_DECREF(value);
11332 if (res < 0)
11333 goto err;
11334 }
11335 /* create entries for deleting chars in z */
11336 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 z_kind = PyUnicode_KIND(z);
11338 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011339 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011341 if (!key)
11342 goto err;
11343 res = PyDict_SetItem(new, key, Py_None);
11344 Py_DECREF(key);
11345 if (res < 0)
11346 goto err;
11347 }
11348 }
11349 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350 int kind;
11351 void *data;
11352
Georg Brandlceee0772007-11-27 23:48:05 +000011353 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011354 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011355 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11356 "to maketrans it must be a dict");
11357 goto err;
11358 }
11359 /* copy entries into the new dict, converting string keys to int keys */
11360 while (PyDict_Next(x, &i, &key, &value)) {
11361 if (PyUnicode_Check(key)) {
11362 /* convert string keys to integer keys */
11363 PyObject *newkey;
11364 if (PyUnicode_GET_SIZE(key) != 1) {
11365 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11366 "table must be of length 1");
11367 goto err;
11368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 kind = PyUnicode_KIND(key);
11370 data = PyUnicode_DATA(key);
11371 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011372 if (!newkey)
11373 goto err;
11374 res = PyDict_SetItem(new, newkey, value);
11375 Py_DECREF(newkey);
11376 if (res < 0)
11377 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011378 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011379 /* just keep integer keys */
11380 if (PyDict_SetItem(new, key, value) < 0)
11381 goto err;
11382 } else {
11383 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11384 "be strings or integers");
11385 goto err;
11386 }
11387 }
11388 }
11389 return new;
11390 err:
11391 Py_DECREF(new);
11392 return NULL;
11393}
11394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011395PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011396 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397\n\
11398Return a copy of the string S, where all characters have been mapped\n\
11399through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011400Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011401Unmapped characters are left untouched. Characters mapped to None\n\
11402are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403
11404static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408}
11409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011413Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414
11415static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011416unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418 return fixup(self, fixupper);
11419}
11420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011421PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011422 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011424Pad a numeric string S with zeros on the left, to fill a field\n\
11425of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426
11427static PyObject *
11428unicode_zfill(PyUnicodeObject *self, PyObject *args)
11429{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011430 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011432 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 int kind;
11434 void *data;
11435 Py_UCS4 chr;
11436
11437 if (PyUnicode_READY(self) == -1)
11438 return NULL;
11439
Martin v. Löwis18e16552006-02-15 17:27:45 +000011440 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441 return NULL;
11442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011444 if (PyUnicode_CheckExact(self)) {
11445 Py_INCREF(self);
11446 return (PyObject*) self;
11447 }
11448 else
11449 return PyUnicode_FromUnicode(
11450 PyUnicode_AS_UNICODE(self),
11451 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453 }
11454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456
11457 u = pad(self, fill, 0, '0');
11458
Walter Dörwald068325e2002-04-15 13:36:47 +000011459 if (u == NULL)
11460 return NULL;
11461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 kind = PyUnicode_KIND(u);
11463 data = PyUnicode_DATA(u);
11464 chr = PyUnicode_READ(kind, data, fill);
11465
11466 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 PyUnicode_WRITE(kind, data, 0, chr);
11469 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470 }
11471
11472 return (PyObject*) u;
11473}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() applied to self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
11482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011483PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011486Return True if S starts with the specified prefix, False otherwise.\n\
11487With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011488With optional end, stop comparing S at that position.\n\
11489prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
11491static PyObject *
11492unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011495 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011497 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011498 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011499 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500
Jesus Ceaac451502011-04-20 17:09:23 +020011501 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011502 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011503 if (PyTuple_Check(subobj)) {
11504 Py_ssize_t i;
11505 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11506 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011508 if (substring == NULL)
11509 return NULL;
11510 result = tailmatch(self, substring, start, end, -1);
11511 Py_DECREF(substring);
11512 if (result) {
11513 Py_RETURN_TRUE;
11514 }
11515 }
11516 /* nothing matched */
11517 Py_RETURN_FALSE;
11518 }
11519 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011520 if (substring == NULL) {
11521 if (PyErr_ExceptionMatches(PyExc_TypeError))
11522 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11523 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011524 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011525 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011526 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011528 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529}
11530
11531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011532PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011533 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011535Return True if S ends with the specified suffix, False otherwise.\n\
11536With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011537With optional end, stop comparing S at that position.\n\
11538suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539
11540static PyObject *
11541unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011542 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011544 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011546 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011547 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011548 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549
Jesus Ceaac451502011-04-20 17:09:23 +020011550 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011552 if (PyTuple_Check(subobj)) {
11553 Py_ssize_t i;
11554 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11555 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011557 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011559 result = tailmatch(self, substring, start, end, +1);
11560 Py_DECREF(substring);
11561 if (result) {
11562 Py_RETURN_TRUE;
11563 }
11564 }
11565 Py_RETURN_FALSE;
11566 }
11567 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011568 if (substring == NULL) {
11569 if (PyErr_ExceptionMatches(PyExc_TypeError))
11570 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11571 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011572 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011573 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011574 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011576 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577}
11578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011580
11581PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011583\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011584Return a formatted version of S, using substitutions from args and kwargs.\n\
11585The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011586
Eric Smith27bbca62010-11-04 17:06:58 +000011587PyDoc_STRVAR(format_map__doc__,
11588 "S.format_map(mapping) -> str\n\
11589\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011590Return a formatted version of S, using substitutions from mapping.\n\
11591The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011592
Eric Smith4a7d76d2008-05-30 18:10:19 +000011593static PyObject *
11594unicode__format__(PyObject* self, PyObject* args)
11595{
11596 PyObject *format_spec;
11597
11598 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11599 return NULL;
11600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11602 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011603}
11604
Eric Smith8c663262007-08-25 02:26:07 +000011605PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011607\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011608Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011609
11610static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011611unicode__sizeof__(PyUnicodeObject *v)
11612{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 Py_ssize_t size;
11614
11615 /* If it's a compact object, account for base structure +
11616 character data. */
11617 if (PyUnicode_IS_COMPACT_ASCII(v))
11618 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11619 else if (PyUnicode_IS_COMPACT(v))
11620 size = sizeof(PyCompactUnicodeObject) +
11621 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11622 else {
11623 /* If it is a two-block object, account for base object, and
11624 for character block if present. */
11625 size = sizeof(PyUnicodeObject);
11626 if (v->data.any)
11627 size += (PyUnicode_GET_LENGTH(v) + 1) *
11628 PyUnicode_CHARACTER_SIZE(v);
11629 }
11630 /* If the wstr pointer is present, account for it unless it is shared
11631 with the data pointer. Since PyUnicode_DATA will crash if the object
11632 is not ready, check whether it's either not ready (in which case the
11633 data is entirely in wstr) or if the data is not shared. */
11634 if (_PyUnicode_WSTR(v) &&
11635 (!PyUnicode_IS_READY(v) ||
11636 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11637 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11638 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11639 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11640
11641 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011642}
11643
11644PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011645 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011646
11647static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011648unicode_getnewargs(PyUnicodeObject *v)
11649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 PyObject *copy;
11651 unsigned char *data;
11652 int kind;
11653 if (PyUnicode_READY(v) == -1)
11654 return NULL;
11655 kind = PyUnicode_KIND(v);
11656 data = PyUnicode_1BYTE_DATA(v);
11657 copy = PyUnicode_FromKindAndData(kind, data, PyUnicode_GET_LENGTH(v));
11658 if (!copy)
11659 return NULL;
11660 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011661}
11662
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663static PyMethodDef unicode_methods[] = {
11664
11665 /* Order is according to common usage: often used methods should
11666 appear first, since lookup is done sequentially. */
11667
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011668 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011669 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11670 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011671 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011672 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11673 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11674 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11675 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11676 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11677 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11678 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011679 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011680 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11681 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11682 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011683 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011684 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11685 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11686 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011687 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011688 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011689 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011690 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011691 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11692 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11693 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11694 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11695 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11696 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11697 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11698 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11699 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11700 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11701 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11702 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11703 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11704 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011705 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011706 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011707 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011708 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011709 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011710 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011711 {"maketrans", (PyCFunction) unicode_maketrans,
11712 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011713 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011714#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011715 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716#endif
11717
11718#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011719 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011720 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721#endif
11722
Benjamin Peterson14339b62009-01-31 16:36:08 +000011723 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724 {NULL, NULL}
11725};
11726
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011727static PyObject *
11728unicode_mod(PyObject *v, PyObject *w)
11729{
Brian Curtindfc80e32011-08-10 20:28:54 -050011730 if (!PyUnicode_Check(v))
11731 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011732 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011733}
11734
11735static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011736 0, /*nb_add*/
11737 0, /*nb_subtract*/
11738 0, /*nb_multiply*/
11739 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011740};
11741
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011743 (lenfunc) unicode_length, /* sq_length */
11744 PyUnicode_Concat, /* sq_concat */
11745 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11746 (ssizeargfunc) unicode_getitem, /* sq_item */
11747 0, /* sq_slice */
11748 0, /* sq_ass_item */
11749 0, /* sq_ass_slice */
11750 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751};
11752
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011753static PyObject*
11754unicode_subscript(PyUnicodeObject* self, PyObject* item)
11755{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 if (PyUnicode_READY(self) == -1)
11757 return NULL;
11758
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011759 if (PyIndex_Check(item)) {
11760 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011761 if (i == -1 && PyErr_Occurred())
11762 return NULL;
11763 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011765 return unicode_getitem(self, i);
11766 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011767 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011769 Py_UNICODE* result_buf;
11770 PyObject* result;
11771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011773 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011774 return NULL;
11775 }
11776
11777 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 return PyUnicode_New(0, 0);
11779 } else if (start == 0 && step == 1 &&
11780 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011781 PyUnicode_CheckExact(self)) {
11782 Py_INCREF(self);
11783 return (PyObject *)self;
11784 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011786 } else {
11787 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011788 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11789 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011790
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 if (result_buf == NULL)
11792 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011793
11794 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11795 result_buf[i] = source_buf[cur];
11796 }
Tim Petersced69f82003-09-16 20:30:58 +000011797
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011798 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011799 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011800 return result;
11801 }
11802 } else {
11803 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11804 return NULL;
11805 }
11806}
11807
11808static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011809 (lenfunc)unicode_length, /* mp_length */
11810 (binaryfunc)unicode_subscript, /* mp_subscript */
11811 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011812};
11813
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815/* Helpers for PyUnicode_Format() */
11816
11817static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011818getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011820 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011822 (*p_argidx)++;
11823 if (arglen < 0)
11824 return args;
11825 else
11826 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827 }
11828 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830 return NULL;
11831}
11832
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011833/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011835static PyObject *
11836formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011838 char *p;
11839 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011841
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842 x = PyFloat_AsDouble(v);
11843 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011844 return NULL;
11845
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011847 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011848
Eric Smith0923d1d2009-04-16 20:16:10 +000011849 p = PyOS_double_to_string(x, type, prec,
11850 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011851 if (p == NULL)
11852 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011854 PyMem_Free(p);
11855 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856}
11857
Tim Peters38fd5b62000-09-21 05:43:11 +000011858static PyObject*
11859formatlong(PyObject *val, int flags, int prec, int type)
11860{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011861 char *buf;
11862 int len;
11863 PyObject *str; /* temporary string object. */
11864 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011865
Benjamin Peterson14339b62009-01-31 16:36:08 +000011866 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11867 if (!str)
11868 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011870 Py_DECREF(str);
11871 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011872}
11873
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011876 size_t buflen,
11877 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011879 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011880 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 if (PyUnicode_GET_LENGTH(v) == 1) {
11882 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 buf[1] = '\0';
11884 return 1;
11885 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011886 goto onError;
11887 }
11888 else {
11889 /* Integer input truncated to a character */
11890 long x;
11891 x = PyLong_AsLong(v);
11892 if (x == -1 && PyErr_Occurred())
11893 goto onError;
11894
11895 if (x < 0 || x > 0x10ffff) {
11896 PyErr_SetString(PyExc_OverflowError,
11897 "%c arg not in range(0x110000)");
11898 return -1;
11899 }
11900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011902 buf[1] = '\0';
11903 return 1;
11904 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011905
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011907 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011909 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910}
11911
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011912/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011913 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011914*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011915#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011916
Alexander Belopolsky40018472011-02-26 01:02:56 +000011917PyObject *
11918PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 void *fmt;
11921 int fmtkind;
11922 PyObject *result;
11923 Py_UCS4 *res, *res0;
11924 Py_UCS4 max;
11925 int kind;
11926 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011930
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011932 PyErr_BadInternalCall();
11933 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11936 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011937 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 fmt = PyUnicode_DATA(uformat);
11939 fmtkind = PyUnicode_KIND(uformat);
11940 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11941 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942
11943 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11945 if (res0 == NULL) {
11946 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011947 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949
11950 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011951 arglen = PyTuple_Size(args);
11952 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953 }
11954 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 arglen = -1;
11956 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011958 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011959 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961
11962 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 if (--rescnt < 0) {
11965 rescnt = fmtcnt + 100;
11966 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11968 if (res0 == NULL){
11969 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 }
11972 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011973 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011974 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011976 }
11977 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011978 /* Got a format specifier */
11979 int flags = 0;
11980 Py_ssize_t width = -1;
11981 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 Py_UCS4 c = '\0';
11983 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011984 int isnumok;
11985 PyObject *v = NULL;
11986 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 void *pbuf;
11988 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 Py_ssize_t len, len1;
11991 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 fmtpos++;
11994 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
11995 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 Py_ssize_t keylen;
11997 PyObject *key;
11998 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000011999
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 if (dict == NULL) {
12001 PyErr_SetString(PyExc_TypeError,
12002 "format requires a mapping");
12003 goto onError;
12004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012006 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012008 /* Skip over balanced parentheses */
12009 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012011 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012013 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012017 if (fmtcnt < 0 || pcount > 0) {
12018 PyErr_SetString(PyExc_ValueError,
12019 "incomplete format key");
12020 goto onError;
12021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012023 if (key == NULL)
12024 goto onError;
12025 if (args_owned) {
12026 Py_DECREF(args);
12027 args_owned = 0;
12028 }
12029 args = PyObject_GetItem(dict, key);
12030 Py_DECREF(key);
12031 if (args == NULL) {
12032 goto onError;
12033 }
12034 args_owned = 1;
12035 arglen = -1;
12036 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012037 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012038 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012040 case '-': flags |= F_LJUST; continue;
12041 case '+': flags |= F_SIGN; continue;
12042 case ' ': flags |= F_BLANK; continue;
12043 case '#': flags |= F_ALT; continue;
12044 case '0': flags |= F_ZERO; continue;
12045 }
12046 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012047 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012048 if (c == '*') {
12049 v = getnextarg(args, arglen, &argidx);
12050 if (v == NULL)
12051 goto onError;
12052 if (!PyLong_Check(v)) {
12053 PyErr_SetString(PyExc_TypeError,
12054 "* wants int");
12055 goto onError;
12056 }
12057 width = PyLong_AsLong(v);
12058 if (width == -1 && PyErr_Occurred())
12059 goto onError;
12060 if (width < 0) {
12061 flags |= F_LJUST;
12062 width = -width;
12063 }
12064 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012065 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012066 }
12067 else if (c >= '0' && c <= '9') {
12068 width = c - '0';
12069 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012071 if (c < '0' || c > '9')
12072 break;
12073 if ((width*10) / 10 != width) {
12074 PyErr_SetString(PyExc_ValueError,
12075 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012076 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012077 }
12078 width = width*10 + (c - '0');
12079 }
12080 }
12081 if (c == '.') {
12082 prec = 0;
12083 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012084 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012085 if (c == '*') {
12086 v = getnextarg(args, arglen, &argidx);
12087 if (v == NULL)
12088 goto onError;
12089 if (!PyLong_Check(v)) {
12090 PyErr_SetString(PyExc_TypeError,
12091 "* wants int");
12092 goto onError;
12093 }
12094 prec = PyLong_AsLong(v);
12095 if (prec == -1 && PyErr_Occurred())
12096 goto onError;
12097 if (prec < 0)
12098 prec = 0;
12099 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012101 }
12102 else if (c >= '0' && c <= '9') {
12103 prec = c - '0';
12104 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012106 if (c < '0' || c > '9')
12107 break;
12108 if ((prec*10) / 10 != prec) {
12109 PyErr_SetString(PyExc_ValueError,
12110 "prec too big");
12111 goto onError;
12112 }
12113 prec = prec*10 + (c - '0');
12114 }
12115 }
12116 } /* prec */
12117 if (fmtcnt >= 0) {
12118 if (c == 'h' || c == 'l' || c == 'L') {
12119 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012121 }
12122 }
12123 if (fmtcnt < 0) {
12124 PyErr_SetString(PyExc_ValueError,
12125 "incomplete format");
12126 goto onError;
12127 }
12128 if (c != '%') {
12129 v = getnextarg(args, arglen, &argidx);
12130 if (v == NULL)
12131 goto onError;
12132 }
12133 sign = 0;
12134 fill = ' ';
12135 switch (c) {
12136
12137 case '%':
12138 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 len = 1;
12143 break;
12144
12145 case 's':
12146 case 'r':
12147 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012148 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012149 temp = v;
12150 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012151 }
12152 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012153 if (c == 's')
12154 temp = PyObject_Str(v);
12155 else if (c == 'r')
12156 temp = PyObject_Repr(v);
12157 else
12158 temp = PyObject_ASCII(v);
12159 if (temp == NULL)
12160 goto onError;
12161 if (PyUnicode_Check(temp))
12162 /* nothing to do */;
12163 else {
12164 Py_DECREF(temp);
12165 PyErr_SetString(PyExc_TypeError,
12166 "%s argument has non-string str()");
12167 goto onError;
12168 }
12169 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 if (PyUnicode_READY(temp) == -1) {
12171 Py_CLEAR(temp);
12172 goto onError;
12173 }
12174 pbuf = PyUnicode_DATA(temp);
12175 kind = PyUnicode_KIND(temp);
12176 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012177 if (prec >= 0 && len > prec)
12178 len = prec;
12179 break;
12180
12181 case 'i':
12182 case 'd':
12183 case 'u':
12184 case 'o':
12185 case 'x':
12186 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012187 isnumok = 0;
12188 if (PyNumber_Check(v)) {
12189 PyObject *iobj=NULL;
12190
12191 if (PyLong_Check(v)) {
12192 iobj = v;
12193 Py_INCREF(iobj);
12194 }
12195 else {
12196 iobj = PyNumber_Long(v);
12197 }
12198 if (iobj!=NULL) {
12199 if (PyLong_Check(iobj)) {
12200 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012201 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012202 Py_DECREF(iobj);
12203 if (!temp)
12204 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 if (PyUnicode_READY(temp) == -1) {
12206 Py_CLEAR(temp);
12207 goto onError;
12208 }
12209 pbuf = PyUnicode_DATA(temp);
12210 kind = PyUnicode_KIND(temp);
12211 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012212 sign = 1;
12213 }
12214 else {
12215 Py_DECREF(iobj);
12216 }
12217 }
12218 }
12219 if (!isnumok) {
12220 PyErr_Format(PyExc_TypeError,
12221 "%%%c format: a number is required, "
12222 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12223 goto onError;
12224 }
12225 if (flags & F_ZERO)
12226 fill = '0';
12227 break;
12228
12229 case 'e':
12230 case 'E':
12231 case 'f':
12232 case 'F':
12233 case 'g':
12234 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012235 temp = formatfloat(v, flags, prec, c);
12236 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012237 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 if (PyUnicode_READY(temp) == -1) {
12239 Py_CLEAR(temp);
12240 goto onError;
12241 }
12242 pbuf = PyUnicode_DATA(temp);
12243 kind = PyUnicode_KIND(temp);
12244 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 sign = 1;
12246 if (flags & F_ZERO)
12247 fill = '0';
12248 break;
12249
12250 case 'c':
12251 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
12254 if (len < 0)
12255 goto onError;
12256 break;
12257
12258 default:
12259 PyErr_Format(PyExc_ValueError,
12260 "unsupported format character '%c' (0x%x) "
12261 "at index %zd",
12262 (31<=c && c<=126) ? (char)c : '?',
12263 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012265 goto onError;
12266 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 /* pbuf is initialized here. */
12268 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012269 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12271 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12272 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012273 len--;
12274 }
12275 else if (flags & F_SIGN)
12276 sign = '+';
12277 else if (flags & F_BLANK)
12278 sign = ' ';
12279 else
12280 sign = 0;
12281 }
12282 if (width < len)
12283 width = len;
12284 if (rescnt - (sign != 0) < width) {
12285 reslen -= rescnt;
12286 rescnt = width + fmtcnt + 100;
12287 reslen += rescnt;
12288 if (reslen < 0) {
12289 Py_XDECREF(temp);
12290 PyErr_NoMemory();
12291 goto onError;
12292 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12294 if (res0 == 0) {
12295 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012296 Py_XDECREF(temp);
12297 goto onError;
12298 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 }
12301 if (sign) {
12302 if (fill != ' ')
12303 *res++ = sign;
12304 rescnt--;
12305 if (width > len)
12306 width--;
12307 }
12308 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12310 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12313 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012314 }
12315 rescnt -= 2;
12316 width -= 2;
12317 if (width < 0)
12318 width = 0;
12319 len -= 2;
12320 }
12321 if (width > len && !(flags & F_LJUST)) {
12322 do {
12323 --rescnt;
12324 *res++ = fill;
12325 } while (--width > len);
12326 }
12327 if (fill == ' ') {
12328 if (sign)
12329 *res++ = sign;
12330 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12332 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12333 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12334 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012335 }
12336 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 /* Copy all characters, preserving len */
12338 len1 = len;
12339 while (len1--) {
12340 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12341 rescnt--;
12342 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 while (--width >= len) {
12344 --rescnt;
12345 *res++ = ' ';
12346 }
12347 if (dict && (argidx < arglen) && c != '%') {
12348 PyErr_SetString(PyExc_TypeError,
12349 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012350 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012351 goto onError;
12352 }
12353 Py_XDECREF(temp);
12354 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355 } /* until end */
12356 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012357 PyErr_SetString(PyExc_TypeError,
12358 "not all arguments converted during string formatting");
12359 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360 }
12361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362
12363 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12364 if (*res > max)
12365 max = *res;
12366 result = PyUnicode_New(reslen - rescnt, max);
12367 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012368 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 kind = PyUnicode_KIND(result);
12370 for (res = res0; res < res0+reslen-rescnt; res++)
12371 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12372 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012374 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375 }
12376 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377 return (PyObject *)result;
12378
Benjamin Peterson29060642009-01-31 22:14:21 +000012379 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381 Py_DECREF(uformat);
12382 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384 }
12385 return NULL;
12386}
12387
Jeremy Hylton938ace62002-07-17 16:30:39 +000012388static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012389unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12390
Tim Peters6d6c1a32001-08-02 04:15:00 +000012391static PyObject *
12392unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12393{
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012395 static char *kwlist[] = {"object", "encoding", "errors", 0};
12396 char *encoding = NULL;
12397 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012398
Benjamin Peterson14339b62009-01-31 16:36:08 +000012399 if (type != &PyUnicode_Type)
12400 return unicode_subtype_new(type, args, kwds);
12401 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012402 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012403 return NULL;
12404 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012406 if (encoding == NULL && errors == NULL)
12407 return PyObject_Str(x);
12408 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012409 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012410}
12411
Guido van Rossume023fe02001-08-30 03:12:59 +000012412static PyObject *
12413unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12414{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012415 PyUnicodeObject *tmp, *pnew;
12416 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012418
Benjamin Peterson14339b62009-01-31 16:36:08 +000012419 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12420 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12421 if (tmp == NULL)
12422 return NULL;
12423 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12425 // it seems kind of strange that tp_alloc gets passed the size
12426 // of the unicode string because there will follow another
12427 // malloc.
12428 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12429 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012430 if (pnew == NULL) {
12431 Py_DECREF(tmp);
12432 return NULL;
12433 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12435 if (_PyUnicode_WSTR(pnew) == NULL) {
12436 err = PyErr_NoMemory();
12437 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12440 _PyUnicode_WSTR_LENGTH(pnew) = n;
12441 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12442 _PyUnicode_STATE(pnew).interned = 0;
12443 _PyUnicode_STATE(pnew).kind = 0;
12444 _PyUnicode_STATE(pnew).compact = 0;
12445 _PyUnicode_STATE(pnew).ready = 0;
12446 _PyUnicode_STATE(pnew).ascii = 0;
12447 pnew->data.any = NULL;
12448 _PyUnicode_LENGTH(pnew) = 0;
12449 pnew->_base.utf8 = NULL;
12450 pnew->_base.utf8_length = 0;
12451
12452 if (PyUnicode_READY(pnew) == -1) {
12453 PyObject_FREE(_PyUnicode_WSTR(pnew));
12454 goto onError;
12455 }
12456
Benjamin Peterson14339b62009-01-31 16:36:08 +000012457 Py_DECREF(tmp);
12458 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459
12460 onError:
12461 _Py_ForgetReference((PyObject *)pnew);
12462 PyObject_Del(pnew);
12463 Py_DECREF(tmp);
12464 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012465}
12466
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012467PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012468 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012469\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012470Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012471encoding defaults to the current default string encoding.\n\
12472errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012473
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012474static PyObject *unicode_iter(PyObject *seq);
12475
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012477 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012478 "str", /* tp_name */
12479 sizeof(PyUnicodeObject), /* tp_size */
12480 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012482 (destructor)unicode_dealloc, /* tp_dealloc */
12483 0, /* tp_print */
12484 0, /* tp_getattr */
12485 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012486 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012487 unicode_repr, /* tp_repr */
12488 &unicode_as_number, /* tp_as_number */
12489 &unicode_as_sequence, /* tp_as_sequence */
12490 &unicode_as_mapping, /* tp_as_mapping */
12491 (hashfunc) unicode_hash, /* tp_hash*/
12492 0, /* tp_call*/
12493 (reprfunc) unicode_str, /* tp_str */
12494 PyObject_GenericGetAttr, /* tp_getattro */
12495 0, /* tp_setattro */
12496 0, /* tp_as_buffer */
12497 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012498 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012499 unicode_doc, /* tp_doc */
12500 0, /* tp_traverse */
12501 0, /* tp_clear */
12502 PyUnicode_RichCompare, /* tp_richcompare */
12503 0, /* tp_weaklistoffset */
12504 unicode_iter, /* tp_iter */
12505 0, /* tp_iternext */
12506 unicode_methods, /* tp_methods */
12507 0, /* tp_members */
12508 0, /* tp_getset */
12509 &PyBaseObject_Type, /* tp_base */
12510 0, /* tp_dict */
12511 0, /* tp_descr_get */
12512 0, /* tp_descr_set */
12513 0, /* tp_dictoffset */
12514 0, /* tp_init */
12515 0, /* tp_alloc */
12516 unicode_new, /* tp_new */
12517 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518};
12519
12520/* Initialize the Unicode implementation */
12521
Thomas Wouters78890102000-07-22 19:25:51 +000012522void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012524 int i;
12525
Thomas Wouters477c8d52006-05-27 19:21:47 +000012526 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012528 0x000A, /* LINE FEED */
12529 0x000D, /* CARRIAGE RETURN */
12530 0x001C, /* FILE SEPARATOR */
12531 0x001D, /* GROUP SEPARATOR */
12532 0x001E, /* RECORD SEPARATOR */
12533 0x0085, /* NEXT LINE */
12534 0x2028, /* LINE SEPARATOR */
12535 0x2029, /* PARAGRAPH SEPARATOR */
12536 };
12537
Fred Drakee4315f52000-05-09 19:53:39 +000012538 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012540 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012542
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012543 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012544 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012545 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012546 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012547
12548 /* initialize the linebreak bloom filter */
12549 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 PyUnicode_2BYTE_KIND, linebreak,
12551 sizeof(linebreak) / sizeof(linebreak[0]));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012552
12553 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554}
12555
12556/* Finalize the Unicode implementation */
12557
/* Releases nothing in this implementation: the result is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
12563
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564void
Thomas Wouters78890102000-07-22 19:25:51 +000012565_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012567 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012569 Py_XDECREF(unicode_empty);
12570 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012571
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012572 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012573 if (unicode_latin1[i]) {
12574 Py_DECREF(unicode_latin1[i]);
12575 unicode_latin1[i] = NULL;
12576 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012577 }
Christian Heimesa156e092008-02-16 07:38:31 +000012578 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012580
Walter Dörwald16807132007-05-25 13:52:07 +000012581void
12582PyUnicode_InternInPlace(PyObject **p)
12583{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012584 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12585 PyObject *t;
12586 if (s == NULL || !PyUnicode_Check(s))
12587 Py_FatalError(
12588 "PyUnicode_InternInPlace: unicode strings only please!");
12589 /* If it's a subclass, we don't really know what putting
12590 it in the interned dict might do. */
12591 if (!PyUnicode_CheckExact(s))
12592 return;
12593 if (PyUnicode_CHECK_INTERNED(s))
12594 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595 if (PyUnicode_READY(s) == -1) {
12596 assert(0 && "ready fail in intern...");
12597 return;
12598 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012599 if (interned == NULL) {
12600 interned = PyDict_New();
12601 if (interned == NULL) {
12602 PyErr_Clear(); /* Don't leave an exception */
12603 return;
12604 }
12605 }
12606 /* It might be that the GetItem call fails even
12607 though the key is present in the dictionary,
12608 namely when this happens during a stack overflow. */
12609 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012611 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012612
Benjamin Peterson29060642009-01-31 22:14:21 +000012613 if (t) {
12614 Py_INCREF(t);
12615 Py_DECREF(*p);
12616 *p = t;
12617 return;
12618 }
Walter Dörwald16807132007-05-25 13:52:07 +000012619
Benjamin Peterson14339b62009-01-31 16:36:08 +000012620 PyThreadState_GET()->recursion_critical = 1;
12621 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12622 PyErr_Clear();
12623 PyThreadState_GET()->recursion_critical = 0;
12624 return;
12625 }
12626 PyThreadState_GET()->recursion_critical = 0;
12627 /* The two references in interned are not counted by refcnt.
12628 The deallocator will take care of this */
12629 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012631}
12632
12633void
12634PyUnicode_InternImmortal(PyObject **p)
12635{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12637
Benjamin Peterson14339b62009-01-31 16:36:08 +000012638 PyUnicode_InternInPlace(p);
12639 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012641 Py_INCREF(*p);
12642 }
Walter Dörwald16807132007-05-25 13:52:07 +000012643}
12644
12645PyObject *
12646PyUnicode_InternFromString(const char *cp)
12647{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012648 PyObject *s = PyUnicode_FromString(cp);
12649 if (s == NULL)
12650 return NULL;
12651 PyUnicode_InternInPlace(&s);
12652 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012653}
12654
Alexander Belopolsky40018472011-02-26 01:02:56 +000012655void
12656_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012657{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012658 PyObject *keys;
12659 PyUnicodeObject *s;
12660 Py_ssize_t i, n;
12661 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012662
Benjamin Peterson14339b62009-01-31 16:36:08 +000012663 if (interned == NULL || !PyDict_Check(interned))
12664 return;
12665 keys = PyDict_Keys(interned);
12666 if (keys == NULL || !PyList_Check(keys)) {
12667 PyErr_Clear();
12668 return;
12669 }
Walter Dörwald16807132007-05-25 13:52:07 +000012670
Benjamin Peterson14339b62009-01-31 16:36:08 +000012671 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12672 detector, interned unicode strings are not forcibly deallocated;
12673 rather, we give them their stolen references back, and then clear
12674 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012675
Benjamin Peterson14339b62009-01-31 16:36:08 +000012676 n = PyList_GET_SIZE(keys);
12677 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012678 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012679 for (i = 0; i < n; i++) {
12680 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 if (PyUnicode_READY(s) == -1)
12682 fprintf(stderr, "could not ready string\n");
12683 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012684 case SSTATE_NOT_INTERNED:
12685 /* XXX Shouldn't happen */
12686 break;
12687 case SSTATE_INTERNED_IMMORTAL:
12688 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012690 break;
12691 case SSTATE_INTERNED_MORTAL:
12692 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012694 break;
12695 default:
12696 Py_FatalError("Inconsistent interned string state.");
12697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012699 }
12700 fprintf(stderr, "total size of all interned strings: "
12701 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12702 "mortal/immortal\n", mortal_size, immortal_size);
12703 Py_DECREF(keys);
12704 PyDict_Clear(interned);
12705 Py_DECREF(interned);
12706 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012707}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012708
12709
12710/********************* Unicode Iterator **************************/
12711
12712typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012713 PyObject_HEAD
12714 Py_ssize_t it_index;
12715 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012716} unicodeiterobject;
12717
12718static void
12719unicodeiter_dealloc(unicodeiterobject *it)
12720{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012721 _PyObject_GC_UNTRACK(it);
12722 Py_XDECREF(it->it_seq);
12723 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012724}
12725
12726static int
12727unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12728{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012729 Py_VISIT(it->it_seq);
12730 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012731}
12732
12733static PyObject *
12734unicodeiter_next(unicodeiterobject *it)
12735{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012736 PyUnicodeObject *seq;
12737 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012738
Benjamin Peterson14339b62009-01-31 16:36:08 +000012739 assert(it != NULL);
12740 seq = it->it_seq;
12741 if (seq == NULL)
12742 return NULL;
12743 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12746 int kind = PyUnicode_KIND(seq);
12747 void *data = PyUnicode_DATA(seq);
12748 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12749 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012750 if (item != NULL)
12751 ++it->it_index;
12752 return item;
12753 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012754
Benjamin Peterson14339b62009-01-31 16:36:08 +000012755 Py_DECREF(seq);
12756 it->it_seq = NULL;
12757 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012758}
12759
12760static PyObject *
12761unicodeiter_len(unicodeiterobject *it)
12762{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012763 Py_ssize_t len = 0;
12764 if (it->it_seq)
12765 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12766 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012767}
12768
12769PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12770
12771static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012772 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012773 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012774 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012775};
12776
12777PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012778 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12779 "str_iterator", /* tp_name */
12780 sizeof(unicodeiterobject), /* tp_basicsize */
12781 0, /* tp_itemsize */
12782 /* methods */
12783 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12784 0, /* tp_print */
12785 0, /* tp_getattr */
12786 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012787 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012788 0, /* tp_repr */
12789 0, /* tp_as_number */
12790 0, /* tp_as_sequence */
12791 0, /* tp_as_mapping */
12792 0, /* tp_hash */
12793 0, /* tp_call */
12794 0, /* tp_str */
12795 PyObject_GenericGetAttr, /* tp_getattro */
12796 0, /* tp_setattro */
12797 0, /* tp_as_buffer */
12798 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12799 0, /* tp_doc */
12800 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12801 0, /* tp_clear */
12802 0, /* tp_richcompare */
12803 0, /* tp_weaklistoffset */
12804 PyObject_SelfIter, /* tp_iter */
12805 (iternextfunc)unicodeiter_next, /* tp_iternext */
12806 unicodeiter_methods, /* tp_methods */
12807 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012808};
12809
12810static PyObject *
12811unicode_iter(PyObject *seq)
12812{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012813 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012814
Benjamin Peterson14339b62009-01-31 16:36:08 +000012815 if (!PyUnicode_Check(seq)) {
12816 PyErr_BadInternalCall();
12817 return NULL;
12818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 if (PyUnicode_READY(seq) == -1)
12820 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012821 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12822 if (it == NULL)
12823 return NULL;
12824 it->it_index = 0;
12825 Py_INCREF(seq);
12826 it->it_seq = (PyUnicodeObject *)seq;
12827 _PyObject_GC_TRACK(it);
12828 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012829}
12830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831#define UNIOP(x) Py_UNICODE_##x
12832#define UNIOP_t Py_UNICODE
12833#include "uniops.h"
12834#undef UNIOP
12835#undef UNIOP_t
12836#define UNIOP(x) Py_UCS4_##x
12837#define UNIOP_t Py_UCS4
12838#include "uniops.h"
12839#undef UNIOP
12840#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012841
Victor Stinner71133ff2010-09-01 23:43:53 +000012842Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012843PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012844{
12845 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12846 Py_UNICODE *copy;
12847 Py_ssize_t size;
12848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012849 if (!PyUnicode_Check(unicode)) {
12850 PyErr_BadArgument();
12851 return NULL;
12852 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012853 /* Ensure we won't overflow the size. */
12854 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12855 PyErr_NoMemory();
12856 return NULL;
12857 }
12858 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12859 size *= sizeof(Py_UNICODE);
12860 copy = PyMem_Malloc(size);
12861 if (copy == NULL) {
12862 PyErr_NoMemory();
12863 return NULL;
12864 }
12865 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12866 return copy;
12867}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012868
Georg Brandl66c221e2010-10-14 07:04:07 +000012869/* A _string module, to export formatter_parser and formatter_field_name_split
12870 to the string.Formatter class implemented in Python. */
12871
12872static PyMethodDef _string_methods[] = {
12873 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12874 METH_O, PyDoc_STR("split the argument as a field name")},
12875 {"formatter_parser", (PyCFunction) formatter_parser,
12876 METH_O, PyDoc_STR("parse the argument as a format string")},
12877 {NULL, NULL}
12878};
12879
12880static struct PyModuleDef _string_module = {
12881 PyModuleDef_HEAD_INIT,
12882 "_string",
12883 PyDoc_STR("string helper module"),
12884 0,
12885 _string_methods,
12886 NULL,
12887 NULL,
12888 NULL,
12889 NULL
12890};
12891
12892PyMODINIT_FUNC
12893PyInit__string(void)
12894{
12895 return PyModule_Create(&_string_module);
12896}
12897
12898
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012899#ifdef __cplusplus
12900}
12901#endif