blob: ea88e21900ef24447444147e08ccc3aa7e10b408 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source character is converted with a plain C cast, so converting
   to a narrower type silently truncates values that do not fit.  The
   begin, end and to arguments are each evaluated exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        const from_type *end_ = (end);                  \
        to_type *to_ = (to_type *)(to);                 \
        for (; iter_ < end_; ++iter_, ++to_) {          \
            *to_ = (to_type)*iter_;                     \
        }                                               \
    } while (0)
107
/* Direct accessors for the fields of a unicode object.  Each macro casts
   op to the struct type that declares the field and expands to the field
   itself, so it can be used both for reading and as an assignment target.
   No type checking is performed on op. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* Checked, read-only variants: they assert that op is a unicode object
   before reading the field.  Being comma expressions, they cannot be
   assigned to. */
#define _PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(PyUnicode_Check(op)), \
     ((PyASCIIObject *)(op))->length)
119
120
Walter Dörwald16807132007-05-25 13:52:07 +0000121/* This dictionary holds all interned unicode strings. Note that references
122 to strings in this dictionary are *not* counted in the string's ob_refcnt.
123 When the interned string reaches a refcnt of 0 the string deallocation
124 function will delete the reference from this dictionary.
125
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000128*/
129static PyObject *interned;
130
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000131/* The empty Unicode object is shared to improve performance. */
132static PyUnicodeObject *unicode_empty;
133
134/* Single character Unicode strings in the Latin-1 range are being
135 shared as well. */
136static PyUnicodeObject *unicode_latin1[256];
137
/* Fast detection of the most frequent whitespace characters.
   The table is indexed by a code point in the ASCII range (0x00-0x7F);
   an entry is 1 for a whitespace character and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
168
Alexander Belopolsky40018472011-02-26 01:02:56 +0000169static PyObject *
170unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000171 PyObject **errorHandler,const char *encoding, const char *reason,
172 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
173 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
174
Alexander Belopolsky40018472011-02-26 01:02:56 +0000175static void
176raise_encode_exception(PyObject **exceptionObject,
177 const char *encoding,
178 const Py_UNICODE *unicode, Py_ssize_t size,
179 Py_ssize_t startpos, Py_ssize_t endpos,
180 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000181
/* Fast detection of line break characters in the ASCII range.
   The table is indexed by a code point in 0x00-0x7F; an entry is 1 for a
   line break character and 0 otherwise.  It is a pure lookup table, so
   it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
209
210
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000211Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000212PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000213{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000214#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000215 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000216#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000217 /* This is actually an illegal character, so it should
218 not be passed to unichr. */
219 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000220#endif
221}
222
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223/* --- Bloom Filters ----------------------------------------------------- */
224
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least
   significant log2(BLOOM_WIDTH) bits of each unicode character as the
   bit index (5, 6 or 7 bits for a width of 32, 64 or 128). */
228
229/* the linebreak mask is set up by Unicode_Init below */
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#if LONG_BIT >= 128
232#define BLOOM_WIDTH 128
233#elif LONG_BIT >= 64
234#define BLOOM_WIDTH 64
235#elif LONG_BIT >= 32
236#define BLOOM_WIDTH 32
237#else
238#error "LONG_BIT is smaller than 32"
239#endif
240
/* Integer type that holds a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for characters outside ASCII. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit of mask selected by the low
   bits of ch (ch modulo BLOOM_WIDTH).  BLOOM_ADD needs a modifiable
   lvalue for mask.  A zero result from BLOOM means ch was never added; a
   non-zero result only means it may have been, since different
   characters can share a bit.  Both arguments are fully parenthesized so
   that arbitrary expressions can be passed. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if ch is a line break.  ASCII characters are looked up directly
   in ascii_linebreak; for anything else the bloom mask is consulted first
   and only a positive result is confirmed with Py_UNICODE_ISLINEBREAK().
   Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000251
Alexander Belopolsky40018472011-02-26 01:02:56 +0000252Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200253make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000254{
255 /* calculate simple bloom-style bitmask for a given unicode string */
256
Antoine Pitrouf068f942010-01-13 14:19:12 +0000257 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000258 Py_ssize_t i;
259
260 mask = 0;
261 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200262 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
264 return mask;
265}
266
/* True if chr occurs in str.  The bloom mask is tested first; only when it
   reports a possible match is str actually searched (forward, over its
   whole length) with PyUnicode_FindChar().  chr and str are evaluated more
   than once.  Presumably mask was built from the characters of str --
   the macro itself does not check this. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271/* --- Unicode Object ----------------------------------------------------- */
272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200273static PyObject *
274substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);
275
276static PyObject *
277fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
278
279Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
280 Py_ssize_t size, Py_UCS4 ch,
281 int direction)
282{
283 /* like wcschr, but doesn't stop at NULL characters */
284 Py_ssize_t i;
285 if (direction == 1) {
286 for(i = 0; i < size; i++)
287 if (PyUnicode_READ(kind, s, i) == ch)
288 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
289 }
290 else {
291 for(i = size-1; i >= 0; i--)
292 if (PyUnicode_READ(kind, s, i) == ch)
293 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
294 }
295 return NULL;
296}
297
Alexander Belopolsky40018472011-02-26 01:02:56 +0000298static int
299unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200300 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
302 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200304 /* Resizing is only supported for old unicode objects. */
305 assert(!PyUnicode_IS_COMPACT(unicode));
306 assert(_PyUnicode_WSTR(unicode) != NULL);
307
308 /* ... and only if they have not been readied yet, because
309 callees usually rely on the wstr representation when resizing. */
310 assert(unicode->data.any == NULL);
311
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000312 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200313 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 /* Resizing shared object (unicode_empty or single character
317 objects) in-place is not allowed. Use PyUnicode_Resize()
318 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
322 _PyUnicode_WSTR(unicode)[0] < 256U &&
323 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000325 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 return -1;
327 }
328
Thomas Wouters477c8d52006-05-27 19:21:47 +0000329 /* We allocate one more byte to make sure the string is Ux0000 terminated.
330 The overallocation is also used by fastsearch, which assumes that it's
331 safe to look at str[length] (without making any assumptions about what
332 it contains). */
333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200334 oldstr = _PyUnicode_WSTR(unicode);
335 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
336 sizeof(Py_UNICODE) * (length + 1));
337 if (!_PyUnicode_WSTR(unicode)) {
338 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 PyErr_NoMemory();
340 return -1;
341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200342 _PyUnicode_WSTR(unicode)[length] = 0;
343 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000344
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 if (unicode->data.any != NULL) {
347 PyObject_FREE(unicode->data.any);
348 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
349 PyObject_FREE(unicode->_base.utf8);
350 }
351 unicode->_base.utf8 = NULL;
352 unicode->_base.utf8_length = 0;
353 unicode->data.any = NULL;
354 _PyUnicode_LENGTH(unicode) = 0;
355 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
356 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 _PyUnicode_HASH(unicode) = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000359
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 return 0;
361}
362
/* We allocate one more character to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366
367 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369
370*/
371
#ifdef Py_DEBUG
/* Debug-build statistic: incremented by _PyUnicode_New() each time it goes
   on to allocate a new old-style unicode object. */
int unicode_old_new_calls = 0;
#endif
375
Alexander Belopolsky40018472011-02-26 01:02:56 +0000376static PyUnicodeObject *
377_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378{
379 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200380 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381
Thomas Wouters477c8d52006-05-27 19:21:47 +0000382 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383 if (length == 0 && unicode_empty != NULL) {
384 Py_INCREF(unicode_empty);
385 return unicode_empty;
386 }
387
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000388 /* Ensure we won't overflow the size. */
389 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
390 return (PyUnicodeObject *)PyErr_NoMemory();
391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200392 if (length < 0) {
393 PyErr_SetString(PyExc_SystemError,
394 "Negative size passed to _PyUnicode_New");
395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396 }
397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200398#ifdef Py_DEBUG
399 ++unicode_old_new_calls;
400#endif
401
402 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
403 if (unicode == NULL)
404 return NULL;
405 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
406 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
407 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 PyErr_NoMemory();
409 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200411
Jeremy Hyltond8082792003-09-16 19:41:39 +0000412 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000413 * the caller fails before initializing str -- unicode_resize()
414 * reads str[0], and the Keep-Alive optimization can keep memory
415 * allocated for str alive across a call to unicode_dealloc(unicode).
416 * We don't want unicode_resize to read uninitialized memory in
417 * that case.
418 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 _PyUnicode_WSTR(unicode)[0] = 0;
420 _PyUnicode_WSTR(unicode)[length] = 0;
421 _PyUnicode_WSTR_LENGTH(unicode) = length;
422 _PyUnicode_HASH(unicode) = -1;
423 _PyUnicode_STATE(unicode).interned = 0;
424 _PyUnicode_STATE(unicode).kind = 0;
425 _PyUnicode_STATE(unicode).compact = 0;
426 _PyUnicode_STATE(unicode).ready = 0;
427 _PyUnicode_STATE(unicode).ascii = 0;
428 unicode->data.any = NULL;
429 _PyUnicode_LENGTH(unicode) = 0;
430 unicode->_base.utf8 = NULL;
431 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000433
Benjamin Peterson29060642009-01-31 22:14:21 +0000434 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000435 /* XXX UNREF/NEWREF interface should be more symmetrical */
436 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000437 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000438 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440}
441
#ifdef Py_DEBUG
/* Debug-build statistic: incremented by PyUnicode_New() each time it goes
   on to allocate a new compact unicode object. */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the _PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    char *utf8 = _PyUnicode_UTF8(unicode);
    return utf8;
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    void *compact_data = _PyUnicode_COMPACT_DATA(unicode);
    return compact_data;
}

/* Print the object's address, its compact flags and the candidate data
   locations to stdout, then return PyUnicode_DATA() for the object. */
void *_PyUnicode_data(void *unicode){
    void *data;

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    data = PyUnicode_DATA(unicode);
    return data;
}
#endif
463
464PyObject *
465PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
466{
467 PyObject *obj;
468 PyCompactUnicodeObject *unicode;
469 void *data;
470 int kind_state;
471 int is_sharing = 0, is_ascii = 0;
472 Py_ssize_t char_size;
473 Py_ssize_t struct_size;
474
475 /* Optimization for empty strings */
476 if (size == 0 && unicode_empty != NULL) {
477 Py_INCREF(unicode_empty);
478 return (PyObject *)unicode_empty;
479 }
480
481#ifdef Py_DEBUG
482 ++unicode_new_new_calls;
483#endif
484
485 struct_size = sizeof(PyCompactUnicodeObject);
486 if (maxchar < 128) {
487 kind_state = PyUnicode_1BYTE_KIND;
488 char_size = 1;
489 is_ascii = 1;
490 struct_size = sizeof(PyASCIIObject);
491 }
492 else if (maxchar < 256) {
493 kind_state = PyUnicode_1BYTE_KIND;
494 char_size = 1;
495 }
496 else if (maxchar < 65536) {
497 kind_state = PyUnicode_2BYTE_KIND;
498 char_size = 2;
499 if (sizeof(wchar_t) == 2)
500 is_sharing = 1;
501 }
502 else {
503 kind_state = PyUnicode_4BYTE_KIND;
504 char_size = 4;
505 if (sizeof(wchar_t) == 4)
506 is_sharing = 1;
507 }
508
509 /* Ensure we won't overflow the size. */
510 if (size < 0) {
511 PyErr_SetString(PyExc_SystemError,
512 "Negative size passed to PyUnicode_New");
513 return NULL;
514 }
515 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
516 return PyErr_NoMemory();
517
518 /* Duplicated allocation code from _PyObject_New() instead of a call to
519 * PyObject_New() so we are able to allocate space for the object and
520 * it's data buffer.
521 */
522 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
523 if (obj == NULL)
524 return PyErr_NoMemory();
525 obj = PyObject_INIT(obj, &PyUnicode_Type);
526 if (obj == NULL)
527 return NULL;
528
529 unicode = (PyCompactUnicodeObject *)obj;
530 if (is_ascii)
531 data = ((PyASCIIObject*)obj) + 1;
532 else
533 data = unicode + 1;
534 _PyUnicode_LENGTH(unicode) = size;
535 _PyUnicode_HASH(unicode) = -1;
536 _PyUnicode_STATE(unicode).interned = 0;
537 _PyUnicode_STATE(unicode).kind = kind_state;
538 _PyUnicode_STATE(unicode).compact = 1;
539 _PyUnicode_STATE(unicode).ready = 1;
540 _PyUnicode_STATE(unicode).ascii = is_ascii;
541 if (is_ascii) {
542 ((char*)data)[size] = 0;
543 _PyUnicode_WSTR(unicode) = NULL;
544 }
545 else if (kind_state == PyUnicode_1BYTE_KIND) {
546 ((char*)data)[size] = 0;
547 _PyUnicode_WSTR(unicode) = NULL;
548 _PyUnicode_WSTR_LENGTH(unicode) = 0;
549 unicode->utf8_length = 0;
550 unicode->utf8 = NULL;
551 }
552 else {
553 unicode->utf8 = NULL;
554 if (kind_state == PyUnicode_2BYTE_KIND)
555 ((Py_UCS2*)data)[size] = 0;
556 else /* kind_state == PyUnicode_4BYTE_KIND */
557 ((Py_UCS4*)data)[size] = 0;
558 if (is_sharing) {
559 _PyUnicode_WSTR_LENGTH(unicode) = size;
560 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
561 }
562 else {
563 _PyUnicode_WSTR_LENGTH(unicode) = 0;
564 _PyUnicode_WSTR(unicode) = NULL;
565 }
566 }
567 return obj;
568}
569
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   A high surrogate immediately followed by a low surrogate is combined
   into a single code point; every other unit (including an unpaired
   surrogate) is copied unchanged.  The other conversions are implemented
   as macros for efficiency.

   The output is written to the 4-byte data buffer of unicode, which must
   already have the 4-byte kind.  This function assumes that unicode can
   hold one more code point than wstr characters for a terminating null
   character.  It always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair = 0;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        /* Only look at src[1] when it is still inside the input. */
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF && (src + 1) < end)
            is_pair = (src[1] >= 0xDC00 && src[1] <= 0xDFFF);

        if (is_pair) {
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
608
609int
610PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
611 PyObject *from, Py_ssize_t from_start,
612 Py_ssize_t how_many)
613{
614 int from_kind;
615 int to_kind;
616
617 assert(PyUnicode_Check(from));
618 assert(PyUnicode_Check(to));
619
620 if (PyUnicode_READY(from))
621 return -1;
622 if (PyUnicode_READY(to))
623 return -1;
624
625 from_kind = PyUnicode_KIND(from);
626 to_kind = PyUnicode_KIND(to);
627
628 if (from_kind == to_kind) {
629 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(to);
630 Py_MEMCPY(PyUnicode_1BYTE_DATA(to) + (to_start * char_size),
631 PyUnicode_1BYTE_DATA(from) + (from_start * char_size),
632 how_many * char_size);
633 return 0;
634 }
635
636 switch (from_kind) {
637 case PyUnicode_1BYTE_KIND:
638 switch (to_kind) {
639 case PyUnicode_2BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200640 _PyUnicode_CONVERT_BYTES(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641 unsigned char, Py_UCS2,
642 PyUnicode_1BYTE_DATA(from) + from_start,
643 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
644 PyUnicode_2BYTE_DATA(to) + to_start
645 );
646 break;
647 case PyUnicode_4BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200648 _PyUnicode_CONVERT_BYTES(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649 unsigned char, Py_UCS4,
650 PyUnicode_1BYTE_DATA(from) + from_start,
651 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
652 PyUnicode_4BYTE_DATA(to) + to_start
653 );
654 break;
655 default:
656 goto invalid_state;
657 }
658 break;
659 case PyUnicode_2BYTE_KIND:
660 switch (to_kind) {
661 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200662 _PyUnicode_CONVERT_BYTES(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663 Py_UCS2, unsigned char,
664 PyUnicode_2BYTE_DATA(from) + from_start,
665 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
666 PyUnicode_1BYTE_DATA(to) + to_start
667 );
668 break;
669 case PyUnicode_4BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200670 _PyUnicode_CONVERT_BYTES(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200671 Py_UCS2, Py_UCS4,
672 PyUnicode_2BYTE_DATA(from) + from_start,
673 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
674 PyUnicode_4BYTE_DATA(to) + to_start
675 );
676 break;
677 default:
678 goto invalid_state;
679 }
680 break;
681 case PyUnicode_4BYTE_KIND:
682 switch (to_kind) {
683 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200684 _PyUnicode_CONVERT_BYTES(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200685 Py_UCS4, unsigned char,
686 PyUnicode_4BYTE_DATA(from) + from_start,
687 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
688 PyUnicode_1BYTE_DATA(to) + to_start
689 );
690 break;
691 case PyUnicode_2BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200692 _PyUnicode_CONVERT_BYTES(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200693 Py_UCS4, Py_UCS2,
694 PyUnicode_4BYTE_DATA(from) + from_start,
695 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
696 PyUnicode_2BYTE_DATA(to) + to_start
697 );
698 break;
699 default:
700 goto invalid_state;
701 }
702 break;
703 default:
704 goto invalid_state;
705 }
706 return 0;
707
708invalid_state:
709 PyErr_Format(PyExc_ValueError,
710 "Impossible kind state (from=%i, to=%i) "
711 "in PyUnicode_CopyCharacters",
712 from_kind, to_kind);
713 return -1;
714}
715
716int
717_PyUnicode_FindMaxCharAndNumSurrogatePairs(const wchar_t *begin,
718 const wchar_t *end,
719 Py_UCS4 *maxchar,
720 Py_ssize_t *num_surrogates)
721{
722 const wchar_t *iter;
723
724 if (num_surrogates == NULL || maxchar == NULL) {
725 PyErr_SetString(PyExc_SystemError,
726 "unexpected NULL arguments to "
727 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
728 return -1;
729 }
730
731 *num_surrogates = 0;
732 *maxchar = 0;
733
734 for (iter = begin; iter < end; ) {
735 if (*iter > *maxchar)
736 *maxchar = *iter;
737#if SIZEOF_WCHAR_T == 2
738 if (*iter >= 0xD800 && *iter <= 0xDBFF
739 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
740 {
741 Py_UCS4 surrogate_val;
742 surrogate_val = (((iter[0] & 0x3FF)<<10)
743 | (iter[1] & 0x3FF)) + 0x10000;
744 ++(*num_surrogates);
745 if (surrogate_val > *maxchar)
746 *maxchar = surrogate_val;
747 iter += 2;
748 }
749 else
750 iter++;
751#else
752 iter++;
753#endif
754 }
755 return 0;
756}
757
#ifdef Py_DEBUG
/* Debug-build statistic: incremented by _PyUnicode_Ready() each time it
   goes on to build the canonical representation of a string. */
int unicode_ready_calls = 0;
#endif
761
/* Give a legacy, wchar_t-based string its canonical representation.

   The wstr buffer is scanned for its largest character and the string is
   stored in the narrowest of the 1-, 2- and 4-byte kinds able to hold it.
   Depending on sizeof(wchar_t) the wstr buffer is either converted into a
   newly allocated buffer (and then freed) or shared as the canonical data.

   Returns 0 on success, including when unicode->data.any is already set.
   Returns -1 on failure; allocation failures set an exception through
   PyErr_NoMemory(). */
int
_PyUnicode_Ready(PyUnicodeObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    assert(PyUnicode_Check(unicode));

    /* Canonical data is already present: nothing to do. */
    if (unicode->data.any != NULL) {
        assert(PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
        return 0;
    }

    /* _PyUnicode_Ready() is only intended for old-style API usage where
     * strings were created using _PyObject_New() and where no canonical
     * representation (the str field) has been set yet aka strings
     * which are not yet ready.
     */
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(!PyUnicode_IS_READY(unicode));
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == 0);
    assert(unicode->_base.utf8 == NULL);

#ifdef Py_DEBUG
    ++unicode_ready_calls;
#endif

    /* One past the last wchar_t of the legacy buffer. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (_PyUnicode_FindMaxCharAndNumSurrogatePairs(_PyUnicode_WSTR(unicode), end,
                                                   &maxchar,
                                                   &num_surrogates) == -1) {
        assert(0 && "PyUnicode_FindMaxCharAndNumSurrogatePairs failed");
        return -1;
    }

    if (maxchar < 256) {
        /* Every character fits in one byte: narrow into a new buffer with
           room for a trailing NUL. */
        unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!unicode->data.any) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Below 128 the 1-byte buffer is also used as the UTF-8 form. */
            unicode->_base.utf8 = unicode->data.any;
            unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            unicode->_base.utf8 = NULL;
            unicode->_base.utf8_length = 0;
        }
        /* The wide buffer has been converted; release it. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

        if (sizeof(wchar_t) == 2) {
            /* We can share representations and are done. */
            unicode->data.any = _PyUnicode_WSTR(unicode);
            PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
            _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
            _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
            unicode->_base.utf8 = NULL;
            unicode->_base.utf8_length = 0;
        }
        else {
            assert(sizeof(wchar_t) == 4);

            /* Two bytes per character plus a two-byte terminator. */
            unicode->data.any = PyObject_MALLOC(
                2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
            if (!unicode->data.any) {
                PyErr_NoMemory();
                return -1;
            }
            _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                     _PyUnicode_WSTR(unicode), end,
                                     PyUnicode_2BYTE_DATA(unicode));
            PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
            _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
            _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
            unicode->_base.utf8 = NULL;
            unicode->_base.utf8_length = 0;
            PyObject_FREE(_PyUnicode_WSTR(unicode));
            _PyUnicode_WSTR(unicode) = NULL;
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
        }
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into a single 4-byte character. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!unicode->data.any) {
            PyErr_NoMemory();
            return -1;
        }
        /* Length and kind are set before the conversion helper is called
           on the object. */
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        unicode->_base.utf8 = NULL;
        unicode->_base.utf8_length = 0;
        if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
                                          unicode) < 0) {
            assert(0 && "ConvertWideCharToUCS4 failed");
            return -1;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is not 2 bytes wide here: the wstr buffer itself becomes
           the 4-byte data and is kept. */
        unicode->data.any = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        unicode->_base.utf8 = NULL;
        unicode->_base.utf8_length = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    return 0;
}
902
Alexander Belopolsky40018472011-02-26 01:02:56 +0000903static void
904unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000905{
Walter Dörwald16807132007-05-25 13:52:07 +0000906 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000907 case SSTATE_NOT_INTERNED:
908 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000909
Benjamin Peterson29060642009-01-31 22:14:21 +0000910 case SSTATE_INTERNED_MORTAL:
911 /* revive dead object temporarily for DelItem */
912 Py_REFCNT(unicode) = 3;
913 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
914 Py_FatalError(
915 "deletion of interned string failed");
916 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000917
Benjamin Peterson29060642009-01-31 22:14:21 +0000918 case SSTATE_INTERNED_IMMORTAL:
919 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000920
Benjamin Peterson29060642009-01-31 22:14:21 +0000921 default:
922 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000923 }
924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200925 if (_PyUnicode_WSTR(unicode) &&
926 (!PyUnicode_IS_READY(unicode) ||
927 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
928 PyObject_DEL(_PyUnicode_WSTR(unicode));
929 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
930 PyObject_DEL(unicode->_base.utf8);
931
932 if (PyUnicode_IS_COMPACT(unicode)) {
933 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934 }
935 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936 if (unicode->data.any)
937 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000938 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000939 }
940}
941
Alexander Belopolsky40018472011-02-26 01:02:56 +0000942static int
943_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000944{
945 register PyUnicodeObject *v;
946
947 /* Argument checks */
948 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000949 PyErr_BadInternalCall();
950 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000951 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000952 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200953 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
954 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000955 PyErr_BadInternalCall();
956 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000957 }
958
959 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200960 possible since these are being shared.
961 The same goes for new-representation unicode objects or objects which
962 have already been readied.
963 For these, we simply return a fresh copy with the same Unicode content.
964 */
965 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
966 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
967 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000968 PyUnicodeObject *w = _PyUnicode_New(length);
969 if (w == NULL)
970 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
972 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000973 Py_DECREF(*unicode);
974 *unicode = w;
975 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000976 }
977
978 /* Note that we don't have to modify *unicode for unshared Unicode
979 objects, since we can modify them in-place. */
980 return unicode_resize(v, length);
981}
982
Alexander Belopolsky40018472011-02-26 01:02:56 +0000983int
984PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000985{
986 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
987}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989static PyObject*
990get_latin1_char(unsigned char ch)
991{
992 PyUnicodeObject *unicode = unicode_latin1[ch];
993 if (!unicode) {
994 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
995 if (!unicode)
996 return NULL;
997 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
998 unicode_latin1[ch] = unicode;
999 }
1000 Py_INCREF(unicode);
1001 return (PyObject *)unicode;
1002}
1003
Alexander Belopolsky40018472011-02-26 01:02:56 +00001004PyObject *
1005PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001006{
1007 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001008 Py_UCS4 maxchar = 0;
1009 Py_ssize_t num_surrogates;
1010
1011 if (u == NULL)
1012 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001013
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001014 /* If the Unicode data is known at construction time, we can apply
1015 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001017 /* Optimization for empty strings */
1018 if (size == 0 && unicode_empty != NULL) {
1019 Py_INCREF(unicode_empty);
1020 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001021 }
Tim Petersced69f82003-09-16 20:30:58 +00001022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001023 /* Single character Unicode objects in the Latin-1 range are
1024 shared when using this constructor */
1025 if (size == 1 && *u < 256)
1026 return get_latin1_char((unsigned char)*u);
1027
1028 /* If not empty and not single character, copy the Unicode data
1029 into the new object */
1030 if (_PyUnicode_FindMaxCharAndNumSurrogatePairs(u, u + size, &maxchar,
1031 &num_surrogates) == -1)
1032 return NULL;
1033
1034 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1035 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 if (!unicode)
1037 return NULL;
1038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001039 switch (PyUnicode_KIND(unicode)) {
1040 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001041 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001042 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1043 break;
1044 case PyUnicode_2BYTE_KIND:
1045#if Py_UNICODE_SIZE == 2
1046 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1047#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001048 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1050#endif
1051 break;
1052 case PyUnicode_4BYTE_KIND:
1053#if SIZEOF_WCHAR_T == 2
1054 /* This is the only case which has to process surrogates, thus
1055 a simple copy loop is not enough and we need a function. */
1056 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1057 Py_DECREF(unicode);
1058 return NULL;
1059 }
1060#else
1061 assert(num_surrogates == 0);
1062 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1063#endif
1064 break;
1065 default:
1066 assert(0 && "Impossible state");
1067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068
1069 return (PyObject *)unicode;
1070}
1071
Alexander Belopolsky40018472011-02-26 01:02:56 +00001072PyObject *
1073PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001074{
1075 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001076
Benjamin Peterson14339b62009-01-31 16:36:08 +00001077 if (size < 0) {
1078 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001079 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001080 return NULL;
1081 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001082
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001083 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001084 some optimizations which share commonly used objects.
1085 Also, this means the input must be UTF-8, so fall back to the
1086 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001087 if (u != NULL) {
1088
Benjamin Peterson29060642009-01-31 22:14:21 +00001089 /* Optimization for empty strings */
1090 if (size == 0 && unicode_empty != NULL) {
1091 Py_INCREF(unicode_empty);
1092 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001093 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001094
1095 /* Single characters are shared when using this constructor.
1096 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 if (size == 1 && Py_CHARMASK(*u) < 128)
1098 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001099
1100 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001101 }
1102
Walter Dörwald55507312007-05-18 13:12:10 +00001103 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001104 if (!unicode)
1105 return NULL;
1106
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001107 return (PyObject *)unicode;
1108}
1109
Alexander Belopolsky40018472011-02-26 01:02:56 +00001110PyObject *
1111PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001112{
1113 size_t size = strlen(u);
1114 if (size > PY_SSIZE_T_MAX) {
1115 PyErr_SetString(PyExc_OverflowError, "input too long");
1116 return NULL;
1117 }
1118
1119 return PyUnicode_FromStringAndSize(u, size);
1120}
1121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122PyObject*
1123PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001124{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 PyObject *res;
1126 unsigned char max = 127;
1127 Py_ssize_t i;
1128 for (i = 0; i < size; i++) {
1129 if (u[i] & 0x80) {
1130 max = 255;
1131 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001132 }
1133 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 res = PyUnicode_New(size, max);
1135 if (!res)
1136 return NULL;
1137 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1138 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001139}
1140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141PyObject*
1142PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1143{
1144 PyObject *res;
1145 Py_UCS2 max = 0;
1146 Py_ssize_t i;
1147 for (i = 0; i < size; i++)
1148 if (u[i] > max)
1149 max = u[i];
1150 res = PyUnicode_New(size, max);
1151 if (!res)
1152 return NULL;
1153 if (max >= 256)
1154 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1155 else
1156 for (i = 0; i < size; i++)
1157 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1158 return res;
1159}
1160
1161PyObject*
1162PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1163{
1164 PyObject *res;
1165 Py_UCS4 max = 0;
1166 Py_ssize_t i;
1167 for (i = 0; i < size; i++)
1168 if (u[i] > max)
1169 max = u[i];
1170 res = PyUnicode_New(size, max);
1171 if (!res)
1172 return NULL;
1173 if (max >= 0x10000)
1174 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1175 else {
1176 int kind = PyUnicode_KIND(res);
1177 void *data = PyUnicode_DATA(res);
1178 for (i = 0; i < size; i++)
1179 PyUnicode_WRITE(kind, data, i, u[i]);
1180 }
1181 return res;
1182}
1183
1184PyObject*
1185PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1186{
1187 switch(kind) {
1188 case PyUnicode_1BYTE_KIND:
1189 return PyUnicode_FromUCS1(buffer, size);
1190 case PyUnicode_2BYTE_KIND:
1191 return PyUnicode_FromUCS2(buffer, size);
1192 case PyUnicode_4BYTE_KIND:
1193 return PyUnicode_FromUCS4(buffer, size);
1194 }
1195 assert(0);
1196 return NULL;
1197}
1198
1199
1200/* Widen Unicode objects to larger buffers.
1201 Return NULL if the string is too wide already. */
1202
1203void*
1204_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1205{
1206 Py_ssize_t i;
1207 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1208 void *d = PyUnicode_DATA(s);
1209 unsigned int skind = PyUnicode_KIND(s);
1210 if (PyUnicode_KIND(s) >= kind) {
1211 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1212 return NULL;
1213 }
1214 switch(kind) {
1215 case PyUnicode_2BYTE_KIND: {
1216 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1217 if (!result) {
1218 PyErr_NoMemory();
1219 return 0;
1220 }
1221 for (i = 0; i < len; i++)
1222 result[i] = ((Py_UCS1*)d)[i];
1223 return result;
1224 }
1225 case PyUnicode_4BYTE_KIND: {
1226 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1227 if (!result) {
1228 PyErr_NoMemory();
1229 return 0;
1230 }
1231 for (i = 0; i < len; i++)
1232 result[i] = PyUnicode_READ(skind, d, i);
1233 return result;
1234 }
1235 }
1236 Py_FatalError("invalid kind");
1237 return NULL;
1238}
1239
1240static Py_UCS4*
1241as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1242 int copy_null)
1243{
1244 int kind;
1245 void *data;
1246 Py_ssize_t len, targetlen;
1247 if (PyUnicode_READY(string) == -1)
1248 return NULL;
1249 kind = PyUnicode_KIND(string);
1250 data = PyUnicode_DATA(string);
1251 len = PyUnicode_GET_LENGTH(string);
1252 targetlen = len;
1253 if (copy_null)
1254 targetlen++;
1255 if (!target) {
1256 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1257 PyErr_NoMemory();
1258 return NULL;
1259 }
1260 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1261 if (!target) {
1262 PyErr_NoMemory();
1263 return NULL;
1264 }
1265 }
1266 else {
1267 if (targetsize < targetlen) {
1268 PyErr_Format(PyExc_SystemError,
1269 "string is longer than the buffer");
1270 if (copy_null && 0 < targetsize)
1271 target[0] = 0;
1272 return NULL;
1273 }
1274 }
1275 if (kind != PyUnicode_4BYTE_KIND) {
1276 Py_ssize_t i;
1277 for (i = 0; i < len; i++)
1278 target[i] = PyUnicode_READ(kind, data, i);
1279 }
1280 else
1281 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1282 if (copy_null)
1283 target[len] = 0;
1284 return target;
1285}
1286
1287Py_UCS4*
1288PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1289 int copy_null)
1290{
1291 if (target == NULL || targetsize < 1) {
1292 PyErr_BadInternalCall();
1293 return NULL;
1294 }
1295 return as_ucs4(string, target, targetsize, copy_null);
1296}
1297
1298Py_UCS4*
1299PyUnicode_AsUCS4Copy(PyObject *string)
1300{
1301 return as_ucs4(string, NULL, 0, 1);
1302}
1303
#ifdef HAVE_WCHAR_H

/* Create a string from the wchar_t buffer `w` of `size` characters.

   A size of -1 means the length is taken with wcslen().  A NULL buffer is
   accepted only together with size 0, which yields a new empty string; any
   other size with a NULL buffer is a bad internal call. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0) {
        return PyUnicode_New(0, 0);
    }
    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001324
Walter Dörwald346737f2007-05-31 10:44:43 +00001325static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001326makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1327 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001328{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001329 *fmt++ = '%';
1330 if (width) {
1331 if (zeropad)
1332 *fmt++ = '0';
1333 fmt += sprintf(fmt, "%d", width);
1334 }
1335 if (precision)
1336 fmt += sprintf(fmt, ".%d", precision);
1337 if (longflag)
1338 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001339 else if (longlongflag) {
1340 /* longlongflag should only ever be nonzero on machines with
1341 HAVE_LONG_LONG defined */
1342#ifdef HAVE_LONG_LONG
1343 char *f = PY_FORMAT_LONG_LONG;
1344 while (*f)
1345 *fmt++ = *f++;
1346#else
1347 /* we shouldn't ever get here */
1348 assert(0);
1349 *fmt++ = 'l';
1350#endif
1351 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001352 else if (size_tflag) {
1353 char *f = PY_FORMAT_SIZE_T;
1354 while (*f)
1355 *fmt++ = *f++;
1356 }
1357 *fmt++ = c;
1358 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001359}
1360
/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' that starts a format specification.  The optional
   width, the optional ".precision" and an optional length modifier ("l",
   "ll" or "z", recognised only when directly followed by 'd', 'u' or 'i')
   are consumed.  Each result is stored through its output pointer unless
   that pointer is NULL.  Returns the position the caller should inspect
   next for the conversion character.

   NOTE(review): width and precision are accumulated in plain int
   arithmetic with no bound check, so a very long run of digits overflows. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++) {
        width = (width*10) + *f - '0';
    }
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++) {
            precision = (precision*10) + *f - '0';
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL) {
        *p_width = width;
    }
    if (p_precision != NULL) {
        *p_precision = precision;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        is_size_t = 1;
        ++f;
    }

    if (p_longflag != NULL) {
        *p_longflag = is_long;
    }
    if (p_longlongflag != NULL) {
        *p_longlongflag = is_longlong;
    }
    if (p_size_tflag != NULL) {
        *p_size_tflag = is_size_t;
    }
    return f;
}
1425
/* Minimum field sizes used by PyUnicode_FromFormatV() when it reserves
   space for formatted numbers. */

/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1433
Walter Dörwaldd2034312007-05-18 16:29:38 +00001434PyObject *
1435PyUnicode_FromFormatV(const char *format, va_list vargs)
1436{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001437 va_list count;
1438 Py_ssize_t callcount = 0;
1439 PyObject **callresults = NULL;
1440 PyObject **callresult = NULL;
1441 Py_ssize_t n = 0;
1442 int width = 0;
1443 int precision = 0;
1444 int zeropad;
1445 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001447 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001448 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1450 Py_UCS4 argmaxchar;
1451 Py_ssize_t numbersize = 0;
1452 char *numberresults = NULL;
1453 char *numberresult = NULL;
1454 Py_ssize_t i;
1455 int kind;
1456 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001457
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001458 Py_VA_COPY(count, vargs);
    /* step 1: count the number of %S/%R/%A/%s format specifications
     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
     * result in an array)
     * also estimate an upper bound for all the number formats in the string;
     * numbers will be formatted in step 3 and be kept in a '\0'-separated
     * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001466 for (f = format; *f; f++) {
1467 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001468 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1470 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1471 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1472 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001475#ifdef HAVE_LONG_LONG
1476 if (longlongflag) {
1477 if (width < MAX_LONG_LONG_CHARS)
1478 width = MAX_LONG_LONG_CHARS;
1479 }
1480 else
1481#endif
1482 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1483 including sign. Decimal takes the most space. This
1484 isn't enough for octal. If a width is specified we
1485 need more (which we allocate later). */
1486 if (width < MAX_LONG_CHARS)
1487 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488
1489 /* account for the size + '\0' to separate numbers
1490 inside of the numberresults buffer */
1491 numbersize += (width + 1);
1492 }
1493 }
1494 else if ((unsigned char)*f > 127) {
1495 PyErr_Format(PyExc_ValueError,
1496 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1497 "string, got a non-ASCII byte: 0x%02x",
1498 (unsigned char)*f);
1499 return NULL;
1500 }
1501 }
1502 /* step 2: allocate memory for the results of
1503 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1504 if (callcount) {
1505 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1506 if (!callresults) {
1507 PyErr_NoMemory();
1508 return NULL;
1509 }
1510 callresult = callresults;
1511 }
1512 /* step 2.5: allocate memory for the results of formating numbers */
1513 if (numbersize) {
1514 numberresults = PyObject_Malloc(numbersize);
1515 if (!numberresults) {
1516 PyErr_NoMemory();
1517 goto fail;
1518 }
1519 numberresult = numberresults;
1520 }
1521
1522 /* step 3: format numbers and figure out how large a buffer we need */
1523 for (f = format; *f; f++) {
1524 if (*f == '%') {
1525 const char* p;
1526 int longflag;
1527 int longlongflag;
1528 int size_tflag;
1529 int numprinted;
1530
1531 p = f;
1532 zeropad = (f[1] == '0');
1533 f = parse_format_flags(f, &width, &precision,
1534 &longflag, &longlongflag, &size_tflag);
1535 switch (*f) {
1536 case 'c':
1537 {
1538 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001539 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 n++;
1541 break;
1542 }
1543 case '%':
1544 n++;
1545 break;
1546 case 'i':
1547 case 'd':
1548 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1549 width, precision, *f);
1550 if (longflag)
1551 numprinted = sprintf(numberresult, fmt,
1552 va_arg(count, long));
1553#ifdef HAVE_LONG_LONG
1554 else if (longlongflag)
1555 numprinted = sprintf(numberresult, fmt,
1556 va_arg(count, PY_LONG_LONG));
1557#endif
1558 else if (size_tflag)
1559 numprinted = sprintf(numberresult, fmt,
1560 va_arg(count, Py_ssize_t));
1561 else
1562 numprinted = sprintf(numberresult, fmt,
1563 va_arg(count, int));
1564 n += numprinted;
1565 /* advance by +1 to skip over the '\0' */
1566 numberresult += (numprinted + 1);
1567 assert(*(numberresult - 1) == '\0');
1568 assert(*(numberresult - 2) != '\0');
1569 assert(numprinted >= 0);
1570 assert(numberresult <= numberresults + numbersize);
1571 break;
1572 case 'u':
1573 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1574 width, precision, 'u');
1575 if (longflag)
1576 numprinted = sprintf(numberresult, fmt,
1577 va_arg(count, unsigned long));
1578#ifdef HAVE_LONG_LONG
1579 else if (longlongflag)
1580 numprinted = sprintf(numberresult, fmt,
1581 va_arg(count, unsigned PY_LONG_LONG));
1582#endif
1583 else if (size_tflag)
1584 numprinted = sprintf(numberresult, fmt,
1585 va_arg(count, size_t));
1586 else
1587 numprinted = sprintf(numberresult, fmt,
1588 va_arg(count, unsigned int));
1589 n += numprinted;
1590 numberresult += (numprinted + 1);
1591 assert(*(numberresult - 1) == '\0');
1592 assert(*(numberresult - 2) != '\0');
1593 assert(numprinted >= 0);
1594 assert(numberresult <= numberresults + numbersize);
1595 break;
1596 case 'x':
1597 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1598 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1599 n += numprinted;
1600 numberresult += (numprinted + 1);
1601 assert(*(numberresult - 1) == '\0');
1602 assert(*(numberresult - 2) != '\0');
1603 assert(numprinted >= 0);
1604 assert(numberresult <= numberresults + numbersize);
1605 break;
1606 case 'p':
1607 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1608 /* %p is ill-defined: ensure leading 0x. */
1609 if (numberresult[1] == 'X')
1610 numberresult[1] = 'x';
1611 else if (numberresult[1] != 'x') {
1612 memmove(numberresult + 2, numberresult,
1613 strlen(numberresult) + 1);
1614 numberresult[0] = '0';
1615 numberresult[1] = 'x';
1616 numprinted += 2;
1617 }
1618 n += numprinted;
1619 numberresult += (numprinted + 1);
1620 assert(*(numberresult - 1) == '\0');
1621 assert(*(numberresult - 2) != '\0');
1622 assert(numprinted >= 0);
1623 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001624 break;
1625 case 's':
1626 {
1627 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001628 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001629 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1630 if (!str)
1631 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632 /* since PyUnicode_DecodeUTF8 returns already flexible
1633 unicode objects, there is no need to call ready on them */
1634 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001635 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001637 /* Remember the str and switch to the next slot */
1638 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001639 break;
1640 }
1641 case 'U':
1642 {
1643 PyObject *obj = va_arg(count, PyObject *);
1644 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001645 if (PyUnicode_READY(obj) == -1)
1646 goto fail;
1647 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001648 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001649 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001650 break;
1651 }
1652 case 'V':
1653 {
1654 PyObject *obj = va_arg(count, PyObject *);
1655 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001656 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001657 assert(obj || str);
1658 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001659 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 if (PyUnicode_READY(obj) == -1)
1661 goto fail;
1662 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001663 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001664 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001665 *callresult++ = NULL;
1666 }
1667 else {
1668 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1669 if (!str_obj)
1670 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001672 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001674 *callresult++ = str_obj;
1675 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001676 break;
1677 }
1678 case 'S':
1679 {
1680 PyObject *obj = va_arg(count, PyObject *);
1681 PyObject *str;
1682 assert(obj);
1683 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001685 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001686 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001687 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001689 /* Remember the str and switch to the next slot */
1690 *callresult++ = str;
1691 break;
1692 }
1693 case 'R':
1694 {
1695 PyObject *obj = va_arg(count, PyObject *);
1696 PyObject *repr;
1697 assert(obj);
1698 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001700 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001702 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001703 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001704 /* Remember the repr and switch to the next slot */
1705 *callresult++ = repr;
1706 break;
1707 }
1708 case 'A':
1709 {
1710 PyObject *obj = va_arg(count, PyObject *);
1711 PyObject *ascii;
1712 assert(obj);
1713 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001715 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001717 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001719 /* Remember the repr and switch to the next slot */
1720 *callresult++ = ascii;
1721 break;
1722 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001723 default:
1724 /* if we stumble upon an unknown
1725 formatting code, copy the rest of
1726 the format string to the output
1727 string. (we cannot just skip the
1728 code, since there's no way to know
1729 what's in the argument list) */
1730 n += strlen(p);
1731 goto expand;
1732 }
1733 } else
1734 n++;
1735 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001736 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001737 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001739 we don't have to resize the string.
1740 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001742 if (!string)
1743 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 kind = PyUnicode_KIND(string);
1745 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001746 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001747 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001750 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001751 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001752
1753 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1755 /* checking for == because the last argument could be a empty
1756 string, which causes i to point to end, the assert at the end of
1757 the loop */
1758 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001759
Benjamin Peterson14339b62009-01-31 16:36:08 +00001760 switch (*f) {
1761 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001762 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 const int ordinal = va_arg(vargs, int);
1764 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001765 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001766 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001767 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001768 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001769 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001770 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 case 'p':
1772 /* unused, since we already have the result */
1773 if (*f == 'p')
1774 (void) va_arg(vargs, void *);
1775 else
1776 (void) va_arg(vargs, int);
1777 /* extract the result from numberresults and append. */
1778 for (; *numberresult; ++i, ++numberresult)
1779 PyUnicode_WRITE(kind, data, i, *numberresult);
1780 /* skip over the separating '\0' */
1781 assert(*numberresult == '\0');
1782 numberresult++;
1783 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001784 break;
1785 case 's':
1786 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001787 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001789 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 size = PyUnicode_GET_LENGTH(*callresult);
1791 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
1792 PyUnicode_CopyCharacters((PyObject*)string, i,
1793 *callresult, 0,
1794 size);
1795 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001796 /* We're done with the unicode()/repr() => forget it */
1797 Py_DECREF(*callresult);
1798 /* switch to next unicode()/repr() result */
1799 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001800 break;
1801 }
1802 case 'U':
1803 {
1804 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 Py_ssize_t size;
1806 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1807 size = PyUnicode_GET_LENGTH(obj);
1808 PyUnicode_CopyCharacters((PyObject*)string, i,
1809 obj, 0,
1810 size);
1811 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001812 break;
1813 }
1814 case 'V':
1815 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001817 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001818 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001819 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 size = PyUnicode_GET_LENGTH(obj);
1821 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1822 PyUnicode_CopyCharacters((PyObject*)string, i,
1823 obj, 0,
1824 size);
1825 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001826 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 size = PyUnicode_GET_LENGTH(*callresult);
1828 assert(PyUnicode_KIND(*callresult) <=
1829 PyUnicode_KIND(string));
1830 PyUnicode_CopyCharacters((PyObject*)string, i,
1831 *callresult,
1832 0, size);
1833 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001834 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001835 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001836 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001837 break;
1838 }
1839 case 'S':
1840 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001841 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001842 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001843 /* unused, since we already have the result */
1844 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
1846 PyUnicode_CopyCharacters((PyObject*)string, i,
1847 *callresult, 0,
1848 PyUnicode_GET_LENGTH(*callresult));
1849 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001850 /* We're done with the unicode()/repr() => forget it */
1851 Py_DECREF(*callresult);
1852 /* switch to next unicode()/repr() result */
1853 ++callresult;
1854 break;
1855 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001856 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001858 break;
1859 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860 for (; *p; ++p, ++i)
1861 PyUnicode_WRITE(kind, data, i, *p);
1862 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001863 goto end;
1864 }
Victor Stinner1205f272010-09-11 00:54:47 +00001865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 else {
1867 assert(i < PyUnicode_GET_LENGTH(string));
1868 PyUnicode_WRITE(kind, data, i++, *f);
1869 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001870 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001872
Benjamin Peterson29060642009-01-31 22:14:21 +00001873 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001874 if (callresults)
1875 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 if (numberresults)
1877 PyObject_Free(numberresults);
1878 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001879 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001880 if (callresults) {
1881 PyObject **callresult2 = callresults;
1882 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001883 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001884 ++callresult2;
1885 }
1886 PyObject_Free(callresults);
1887 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 if (numberresults)
1889 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001890 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001891}
1892
Walter Dörwaldd2034312007-05-18 16:29:38 +00001893PyObject *
1894PyUnicode_FromFormat(const char *format, ...)
1895{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001896 PyObject* ret;
1897 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001898
1899#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001900 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001901#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001902 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001903#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001904 ret = PyUnicode_FromFormatV(format, vargs);
1905 va_end(vargs);
1906 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001907}
1908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909#ifdef HAVE_WCHAR_H
1910
Victor Stinner5593d8a2010-10-02 11:11:27 +00001911/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1912 convert a Unicode object to a wide character string.
1913
Victor Stinnerd88d9832011-09-06 02:00:05 +02001914 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001915 character) required to convert the unicode object. Ignore size argument.
1916
Victor Stinnerd88d9832011-09-06 02:00:05 +02001917 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001918 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001919 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001920static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001921unicode_aswidechar(PyUnicodeObject *unicode,
1922 wchar_t *w,
1923 Py_ssize_t size)
1924{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001925 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 const wchar_t *wstr;
1927
1928 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1929 if (wstr == NULL)
1930 return -1;
1931
Victor Stinner5593d8a2010-10-02 11:11:27 +00001932 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001933 if (size > res)
1934 size = res + 1;
1935 else
1936 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001937 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001938 return res;
1939 }
1940 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001942}
1943
1944Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001945PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001946 wchar_t *w,
1947 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948{
1949 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001950 PyErr_BadInternalCall();
1951 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001953 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954}
1955
Victor Stinner137c34c2010-09-29 10:25:54 +00001956wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001957PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001958 Py_ssize_t *size)
1959{
1960 wchar_t* buffer;
1961 Py_ssize_t buflen;
1962
1963 if (unicode == NULL) {
1964 PyErr_BadInternalCall();
1965 return NULL;
1966 }
1967
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001968 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 if (buflen == -1)
1970 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001971 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001972 PyErr_NoMemory();
1973 return NULL;
1974 }
1975
Victor Stinner137c34c2010-09-29 10:25:54 +00001976 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1977 if (buffer == NULL) {
1978 PyErr_NoMemory();
1979 return NULL;
1980 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001981 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001982 if (buflen == -1)
1983 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001984 if (size != NULL)
1985 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001986 return buffer;
1987}
1988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990
Alexander Belopolsky40018472011-02-26 01:02:56 +00001991PyObject *
1992PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001993{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001995 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001996 PyErr_SetString(PyExc_ValueError,
1997 "chr() arg not in range(0x110000)");
1998 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001999 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 if (ordinal < 256)
2002 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 v = PyUnicode_New(1, ordinal);
2005 if (v == NULL)
2006 return NULL;
2007 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2008 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002009}
2010
Alexander Belopolsky40018472011-02-26 01:02:56 +00002011PyObject *
2012PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002014 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002015 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002016 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002017 Py_INCREF(obj);
2018 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002019 }
2020 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002021 /* For a Unicode subtype that's not a Unicode object,
2022 return a true Unicode object with the same data. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 if (PyUnicode_READY(obj) == -1)
2024 return NULL;
2025 return substring((PyUnicodeObject *)obj, 0, PyUnicode_GET_LENGTH(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002026 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002027 PyErr_Format(PyExc_TypeError,
2028 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002029 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002030 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002031}
2032
Alexander Belopolsky40018472011-02-26 01:02:56 +00002033PyObject *
2034PyUnicode_FromEncodedObject(register PyObject *obj,
2035 const char *encoding,
2036 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002037{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002038 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002039 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002040
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002042 PyErr_BadInternalCall();
2043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002045
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002046 /* Decoding bytes objects is the most common case and should be fast */
2047 if (PyBytes_Check(obj)) {
2048 if (PyBytes_GET_SIZE(obj) == 0) {
2049 Py_INCREF(unicode_empty);
2050 v = (PyObject *) unicode_empty;
2051 }
2052 else {
2053 v = PyUnicode_Decode(
2054 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2055 encoding, errors);
2056 }
2057 return v;
2058 }
2059
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002060 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002061 PyErr_SetString(PyExc_TypeError,
2062 "decoding str is not supported");
2063 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002064 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002065
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002066 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2067 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2068 PyErr_Format(PyExc_TypeError,
2069 "coercing to str: need bytes, bytearray "
2070 "or buffer-like object, %.80s found",
2071 Py_TYPE(obj)->tp_name);
2072 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002073 }
Tim Petersced69f82003-09-16 20:30:58 +00002074
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002075 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002076 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002077 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 }
Tim Petersced69f82003-09-16 20:30:58 +00002079 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002080 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002081
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002082 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002083 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084}
2085
/* Normalize an encoding name so that spellings such as "UTF_8" match the
   shortcut names: upper-case characters (as classified by Py_ISUPPER) are
   converted with Py_TOLOWER and every '_' is replaced with '-'.

   encoding:  null-terminated encoding name to normalize.
   lower:     output buffer receiving the normalized, null-terminated name.
   lower_len: size of `lower` in bytes, terminator included.

   Returns 1 on success.  Returns 0 if the name does not fit, i.e. encoding
   is longer than lower_len-1 characters or lower_len is 0; in that case
   the contents of `lower` are not null-terminated and must not be used. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst;
    char *dst_end;

    /* A zero-sized buffer cannot even hold the terminator.  Bail out
       before computing lower_len - 1, which would wrap around. */
    if (lower_len == 0)
        return 0;

    /* Last usable slot: reserved for the terminating null character. */
    dst_end = &lower[lower_len - 1];

    for (src = encoding, dst = lower; *src; src++, dst++) {
        if (dst == dst_end)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2118
Alexander Belopolsky40018472011-02-26 01:02:56 +00002119PyObject *
2120PyUnicode_Decode(const char *s,
2121 Py_ssize_t size,
2122 const char *encoding,
2123 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002124{
2125 PyObject *buffer = NULL, *unicode;
2126 Py_buffer info;
2127 char lower[11]; /* Enough for any encoding shortcut */
2128
2129 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002130 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002131
2132 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002133 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002134 if ((strcmp(lower, "utf-8") == 0) ||
2135 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002136 return PyUnicode_DecodeUTF8(s, size, errors);
2137 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002138 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002139 (strcmp(lower, "iso-8859-1") == 0))
2140 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002141#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002142 else if (strcmp(lower, "mbcs") == 0)
2143 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002144#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002145 else if (strcmp(lower, "ascii") == 0)
2146 return PyUnicode_DecodeASCII(s, size, errors);
2147 else if (strcmp(lower, "utf-16") == 0)
2148 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2149 else if (strcmp(lower, "utf-32") == 0)
2150 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2151 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152
2153 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002154 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002155 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002156 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002157 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 if (buffer == NULL)
2159 goto onError;
2160 unicode = PyCodec_Decode(buffer, encoding, errors);
2161 if (unicode == NULL)
2162 goto onError;
2163 if (!PyUnicode_Check(unicode)) {
2164 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002165 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002166 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 Py_DECREF(unicode);
2168 goto onError;
2169 }
2170 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 if (PyUnicode_READY(unicode)) {
2172 Py_DECREF(unicode);
2173 return NULL;
2174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002176
Benjamin Peterson29060642009-01-31 22:14:21 +00002177 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178 Py_XDECREF(buffer);
2179 return NULL;
2180}
2181
Alexander Belopolsky40018472011-02-26 01:02:56 +00002182PyObject *
2183PyUnicode_AsDecodedObject(PyObject *unicode,
2184 const char *encoding,
2185 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002186{
2187 PyObject *v;
2188
2189 if (!PyUnicode_Check(unicode)) {
2190 PyErr_BadArgument();
2191 goto onError;
2192 }
2193
2194 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002195 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002196
2197 /* Decode via the codec registry */
2198 v = PyCodec_Decode(unicode, encoding, errors);
2199 if (v == NULL)
2200 goto onError;
2201 return v;
2202
Benjamin Peterson29060642009-01-31 22:14:21 +00002203 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002204 return NULL;
2205}
2206
Alexander Belopolsky40018472011-02-26 01:02:56 +00002207PyObject *
2208PyUnicode_AsDecodedUnicode(PyObject *unicode,
2209 const char *encoding,
2210 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002211{
2212 PyObject *v;
2213
2214 if (!PyUnicode_Check(unicode)) {
2215 PyErr_BadArgument();
2216 goto onError;
2217 }
2218
2219 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002220 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002221
2222 /* Decode via the codec registry */
2223 v = PyCodec_Decode(unicode, encoding, errors);
2224 if (v == NULL)
2225 goto onError;
2226 if (!PyUnicode_Check(v)) {
2227 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002228 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002229 Py_TYPE(v)->tp_name);
2230 Py_DECREF(v);
2231 goto onError;
2232 }
2233 return v;
2234
Benjamin Peterson29060642009-01-31 22:14:21 +00002235 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002236 return NULL;
2237}
2238
Alexander Belopolsky40018472011-02-26 01:02:56 +00002239PyObject *
2240PyUnicode_Encode(const Py_UNICODE *s,
2241 Py_ssize_t size,
2242 const char *encoding,
2243 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244{
2245 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002246
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 unicode = PyUnicode_FromUnicode(s, size);
2248 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2251 Py_DECREF(unicode);
2252 return v;
2253}
2254
Alexander Belopolsky40018472011-02-26 01:02:56 +00002255PyObject *
2256PyUnicode_AsEncodedObject(PyObject *unicode,
2257 const char *encoding,
2258 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002259{
2260 PyObject *v;
2261
2262 if (!PyUnicode_Check(unicode)) {
2263 PyErr_BadArgument();
2264 goto onError;
2265 }
2266
2267 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002268 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002269
2270 /* Encode via the codec registry */
2271 v = PyCodec_Encode(unicode, encoding, errors);
2272 if (v == NULL)
2273 goto onError;
2274 return v;
2275
Benjamin Peterson29060642009-01-31 22:14:21 +00002276 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002277 return NULL;
2278}
2279
Victor Stinnerad158722010-10-27 00:25:46 +00002280PyObject *
2281PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002282{
Victor Stinner99b95382011-07-04 14:23:54 +02002283#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002284 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2285 PyUnicode_GET_SIZE(unicode),
2286 NULL);
2287#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002289#else
Victor Stinner793b5312011-04-27 00:24:21 +02002290 PyInterpreterState *interp = PyThreadState_GET()->interp;
2291 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2292 cannot use it to encode and decode filenames before it is loaded. Load
2293 the Python codec requires to encode at least its own filename. Use the C
2294 version of the locale codec until the codec registry is initialized and
2295 the Python codec is loaded.
2296
2297 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2298 cannot only rely on it: check also interp->fscodec_initialized for
2299 subinterpreters. */
2300 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002301 return PyUnicode_AsEncodedString(unicode,
2302 Py_FileSystemDefaultEncoding,
2303 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002304 }
2305 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002306 /* locale encoding with surrogateescape */
2307 wchar_t *wchar;
2308 char *bytes;
2309 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002310 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002311
2312 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2313 if (wchar == NULL)
2314 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002315 bytes = _Py_wchar2char(wchar, &error_pos);
2316 if (bytes == NULL) {
2317 if (error_pos != (size_t)-1) {
2318 char *errmsg = strerror(errno);
2319 PyObject *exc = NULL;
2320 if (errmsg == NULL)
2321 errmsg = "Py_wchar2char() failed";
2322 raise_encode_exception(&exc,
2323 "filesystemencoding",
2324 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2325 error_pos, error_pos+1,
2326 errmsg);
2327 Py_XDECREF(exc);
2328 }
2329 else
2330 PyErr_NoMemory();
2331 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002332 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002333 }
2334 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002335
2336 bytes_obj = PyBytes_FromString(bytes);
2337 PyMem_Free(bytes);
2338 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002339 }
Victor Stinnerad158722010-10-27 00:25:46 +00002340#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002341}
2342
Alexander Belopolsky40018472011-02-26 01:02:56 +00002343PyObject *
2344PyUnicode_AsEncodedString(PyObject *unicode,
2345 const char *encoding,
2346 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002347{
2348 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002349 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002350
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 if (!PyUnicode_Check(unicode)) {
2352 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002353 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354 }
Fred Drakee4315f52000-05-09 19:53:39 +00002355
Victor Stinner2f283c22011-03-02 01:21:46 +00002356 if (encoding == NULL) {
2357 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002358 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002359 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002361 }
Fred Drakee4315f52000-05-09 19:53:39 +00002362
2363 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002364 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002365 if ((strcmp(lower, "utf-8") == 0) ||
2366 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002367 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002368 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002370 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002372 }
Victor Stinner37296e82010-06-10 13:36:23 +00002373 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002374 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002375 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002377#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002378 else if (strcmp(lower, "mbcs") == 0)
2379 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2380 PyUnicode_GET_SIZE(unicode),
2381 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002382#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002383 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002385 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386
2387 /* Encode via the codec registry */
2388 v = PyCodec_Encode(unicode, encoding, errors);
2389 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002390 return NULL;
2391
2392 /* The normal path */
2393 if (PyBytes_Check(v))
2394 return v;
2395
2396 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002397 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002398 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002399 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002400
2401 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2402 "encoder %s returned bytearray instead of bytes",
2403 encoding);
2404 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002405 Py_DECREF(v);
2406 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002407 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002408
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002409 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2410 Py_DECREF(v);
2411 return b;
2412 }
2413
2414 PyErr_Format(PyExc_TypeError,
2415 "encoder did not return a bytes object (type=%.400s)",
2416 Py_TYPE(v)->tp_name);
2417 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002418 return NULL;
2419}
2420
Alexander Belopolsky40018472011-02-26 01:02:56 +00002421PyObject *
2422PyUnicode_AsEncodedUnicode(PyObject *unicode,
2423 const char *encoding,
2424 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002425{
2426 PyObject *v;
2427
2428 if (!PyUnicode_Check(unicode)) {
2429 PyErr_BadArgument();
2430 goto onError;
2431 }
2432
2433 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002434 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002435
2436 /* Encode via the codec registry */
2437 v = PyCodec_Encode(unicode, encoding, errors);
2438 if (v == NULL)
2439 goto onError;
2440 if (!PyUnicode_Check(v)) {
2441 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002442 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002443 Py_TYPE(v)->tp_name);
2444 Py_DECREF(v);
2445 goto onError;
2446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002448
Benjamin Peterson29060642009-01-31 22:14:21 +00002449 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002450 return NULL;
2451}
2452
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002453PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002454PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002455 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002456 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2457}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002458
Christian Heimes5894ba72007-11-04 11:43:14 +00002459PyObject*
2460PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2461{
Victor Stinner99b95382011-07-04 14:23:54 +02002462#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002463 return PyUnicode_DecodeMBCS(s, size, NULL);
2464#elif defined(__APPLE__)
2465 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2466#else
Victor Stinner793b5312011-04-27 00:24:21 +02002467 PyInterpreterState *interp = PyThreadState_GET()->interp;
2468 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2469 cannot use it to encode and decode filenames before it is loaded. Load
2470 the Python codec requires to encode at least its own filename. Use the C
2471 version of the locale codec until the codec registry is initialized and
2472 the Python codec is loaded.
2473
2474 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2475 cannot only rely on it: check also interp->fscodec_initialized for
2476 subinterpreters. */
2477 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002478 return PyUnicode_Decode(s, size,
2479 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002480 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002481 }
2482 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002483 /* locale encoding with surrogateescape */
2484 wchar_t *wchar;
2485 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002486 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002487
2488 if (s[size] != '\0' || size != strlen(s)) {
2489 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2490 return NULL;
2491 }
2492
Victor Stinner168e1172010-10-16 23:16:16 +00002493 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002494 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002495 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002496
Victor Stinner168e1172010-10-16 23:16:16 +00002497 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002498 PyMem_Free(wchar);
2499 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002500 }
Victor Stinnerad158722010-10-27 00:25:46 +00002501#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002502}
2503
Martin v. Löwis011e8422009-05-05 04:43:17 +00002504
2505int
2506PyUnicode_FSConverter(PyObject* arg, void* addr)
2507{
2508 PyObject *output = NULL;
2509 Py_ssize_t size;
2510 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002511 if (arg == NULL) {
2512 Py_DECREF(*(PyObject**)addr);
2513 return 1;
2514 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002515 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002516 output = arg;
2517 Py_INCREF(output);
2518 }
2519 else {
2520 arg = PyUnicode_FromObject(arg);
2521 if (!arg)
2522 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002523 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002524 Py_DECREF(arg);
2525 if (!output)
2526 return 0;
2527 if (!PyBytes_Check(output)) {
2528 Py_DECREF(output);
2529 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2530 return 0;
2531 }
2532 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002533 size = PyBytes_GET_SIZE(output);
2534 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002535 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002536 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002537 Py_DECREF(output);
2538 return 0;
2539 }
2540 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002541 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002542}
2543
2544
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002545int
2546PyUnicode_FSDecoder(PyObject* arg, void* addr)
2547{
2548 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002549 if (arg == NULL) {
2550 Py_DECREF(*(PyObject**)addr);
2551 return 1;
2552 }
2553 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 if (PyUnicode_READY(arg))
2555 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002556 output = arg;
2557 Py_INCREF(output);
2558 }
2559 else {
2560 arg = PyBytes_FromObject(arg);
2561 if (!arg)
2562 return 0;
2563 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2564 PyBytes_GET_SIZE(arg));
2565 Py_DECREF(arg);
2566 if (!output)
2567 return 0;
2568 if (!PyUnicode_Check(output)) {
2569 Py_DECREF(output);
2570 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2571 return 0;
2572 }
2573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002574 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2575 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002576 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2577 Py_DECREF(output);
2578 return 0;
2579 }
2580 *(PyObject**)addr = output;
2581 return Py_CLEANUP_SUPPORTED;
2582}
2583
2584
Martin v. Löwis5b222132007-06-10 09:51:05 +00002585char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002587{
Christian Heimesf3863112007-11-22 07:46:41 +00002588 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2590
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002591 if (!PyUnicode_Check(unicode)) {
2592 PyErr_BadArgument();
2593 return NULL;
2594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002596 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597
2598 if (_PyUnicode_UTF8(unicode) == NULL) {
2599 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2600 if (bytes == NULL)
2601 return NULL;
2602 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2603 if (u->_base.utf8 == NULL) {
2604 Py_DECREF(bytes);
2605 return NULL;
2606 }
2607 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2608 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2609 Py_DECREF(bytes);
2610 }
2611
2612 if (psize)
2613 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2614 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002615}
2616
2617char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2621}
2622
2623#ifdef Py_DEBUG
2624int unicode_as_unicode_calls = 0;
2625#endif
2626
2627
2628Py_UNICODE *
2629PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2630{
2631 PyUnicodeObject *u;
2632 const unsigned char *one_byte;
2633#if SIZEOF_WCHAR_T == 4
2634 const Py_UCS2 *two_bytes;
2635#else
2636 const Py_UCS4 *four_bytes;
2637 const Py_UCS4 *ucs4_end;
2638 Py_ssize_t num_surrogates;
2639#endif
2640 wchar_t *w;
2641 wchar_t *wchar_end;
2642
2643 if (!PyUnicode_Check(unicode)) {
2644 PyErr_BadArgument();
2645 return NULL;
2646 }
2647 u = (PyUnicodeObject*)unicode;
2648 if (_PyUnicode_WSTR(u) == NULL) {
2649 /* Non-ASCII compact unicode object */
2650 assert(_PyUnicode_KIND(u) != 0);
2651 assert(PyUnicode_IS_READY(u));
2652
2653#ifdef Py_DEBUG
2654 ++unicode_as_unicode_calls;
2655#endif
2656
2657 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2658#if SIZEOF_WCHAR_T == 2
2659 four_bytes = PyUnicode_4BYTE_DATA(u);
2660 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2661 num_surrogates = 0;
2662
2663 for (; four_bytes < ucs4_end; ++four_bytes) {
2664 if (*four_bytes > 0xFFFF)
2665 ++num_surrogates;
2666 }
2667
2668 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2669 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2670 if (!_PyUnicode_WSTR(u)) {
2671 PyErr_NoMemory();
2672 return NULL;
2673 }
2674 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2675
2676 w = _PyUnicode_WSTR(u);
2677 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2678 four_bytes = PyUnicode_4BYTE_DATA(u);
2679 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2680 if (*four_bytes > 0xFFFF) {
2681 /* encode surrogate pair in this case */
2682 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2683 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2684 }
2685 else
2686 *w = *four_bytes;
2687
2688 if (w > wchar_end) {
2689 assert(0 && "Miscalculated string end");
2690 }
2691 }
2692 *w = 0;
2693#else
2694 /* sizeof(wchar_t) == 4 */
2695 Py_FatalError("Impossible unicode object state, wstr and str "
2696 "should share memory already.");
2697 return NULL;
2698#endif
2699 }
2700 else {
2701 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2702 (_PyUnicode_LENGTH(u) + 1));
2703 if (!_PyUnicode_WSTR(u)) {
2704 PyErr_NoMemory();
2705 return NULL;
2706 }
2707 if (!PyUnicode_IS_COMPACT_ASCII(u))
2708 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2709 w = _PyUnicode_WSTR(u);
2710 wchar_end = w + _PyUnicode_LENGTH(u);
2711
2712 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2713 one_byte = PyUnicode_1BYTE_DATA(u);
2714 for (; w < wchar_end; ++one_byte, ++w)
2715 *w = *one_byte;
2716 /* null-terminate the wstr */
2717 *w = 0;
2718 }
2719 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2720#if SIZEOF_WCHAR_T == 4
2721 two_bytes = PyUnicode_2BYTE_DATA(u);
2722 for (; w < wchar_end; ++two_bytes, ++w)
2723 *w = *two_bytes;
2724 /* null-terminate the wstr */
2725 *w = 0;
2726#else
2727 /* sizeof(wchar_t) == 2 */
2728 PyObject_FREE(_PyUnicode_WSTR(u));
2729 _PyUnicode_WSTR(u) = NULL;
2730 Py_FatalError("Impossible unicode object state, wstr "
2731 "and str should share memory already.");
2732 return NULL;
2733#endif
2734 }
2735 else {
2736 assert(0 && "This should never happen.");
2737 }
2738 }
2739 }
2740 if (size != NULL)
2741 *size = PyUnicode_WSTR_LENGTH(u);
2742 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002743}
2744
Alexander Belopolsky40018472011-02-26 01:02:56 +00002745Py_UNICODE *
2746PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749}
2750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751
Alexander Belopolsky40018472011-02-26 01:02:56 +00002752Py_ssize_t
2753PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754{
2755 if (!PyUnicode_Check(unicode)) {
2756 PyErr_BadArgument();
2757 goto onError;
2758 }
2759 return PyUnicode_GET_SIZE(unicode);
2760
Benjamin Peterson29060642009-01-31 22:14:21 +00002761 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 return -1;
2763}
2764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002765Py_ssize_t
2766PyUnicode_GetLength(PyObject *unicode)
2767{
2768 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2769 PyErr_BadArgument();
2770 return -1;
2771 }
2772
2773 return PyUnicode_GET_LENGTH(unicode);
2774}
2775
2776Py_UCS4
2777PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2778{
2779 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2780 return PyErr_BadArgument();
2781 return (Py_UCS4)-1;
2782 }
2783 return PyUnicode_READ_CHAR(unicode, index);
2784}
2785
2786int
2787PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2788{
2789 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2790 return PyErr_BadArgument();
2791 return -1;
2792 }
2793
2794 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2795 index, ch);
2796 return 0;
2797}
2798
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
2804
Victor Stinner554f3f02010-06-16 23:33:54 +00002805/* create or adjust a UnicodeDecodeError */
2806static void
2807make_decode_exception(PyObject **exceptionObject,
2808 const char *encoding,
2809 const char *input, Py_ssize_t length,
2810 Py_ssize_t startpos, Py_ssize_t endpos,
2811 const char *reason)
2812{
2813 if (*exceptionObject == NULL) {
2814 *exceptionObject = PyUnicodeDecodeError_Create(
2815 encoding, input, length, startpos, endpos, reason);
2816 }
2817 else {
2818 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2819 goto onError;
2820 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2821 goto onError;
2822 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2823 goto onError;
2824 }
2825 return;
2826
2827onError:
2828 Py_DECREF(*exceptionObject);
2829 *exceptionObject = NULL;
2830}
2831
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002832/* error handling callback helper:
2833 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002834 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 and adjust various state variables.
2836 return 0 on success, -1 on error
2837*/
2838
Alexander Belopolsky40018472011-02-26 01:02:56 +00002839static int
2840unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2841 const char *encoding, const char *reason,
2842 const char **input, const char **inend, Py_ssize_t *startinpos,
2843 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2844 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002845{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002846 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002847
2848 PyObject *restuple = NULL;
2849 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002850 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002851 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002852 Py_ssize_t requiredsize;
2853 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002854 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002855 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002856 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 int res = -1;
2858
2859 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002860 *errorHandler = PyCodec_LookupError(errors);
2861 if (*errorHandler == NULL)
2862 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002863 }
2864
Victor Stinner554f3f02010-06-16 23:33:54 +00002865 make_decode_exception(exceptionObject,
2866 encoding,
2867 *input, *inend - *input,
2868 *startinpos, *endinpos,
2869 reason);
2870 if (*exceptionObject == NULL)
2871 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002872
2873 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2874 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002875 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002876 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002877 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002878 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 }
2880 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002882
2883 /* Copy back the bytes variables, which might have been modified by the
2884 callback */
2885 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2886 if (!inputobj)
2887 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002888 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002889 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002890 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002891 *input = PyBytes_AS_STRING(inputobj);
2892 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002893 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002894 /* we can DECREF safely, as the exception has another reference,
2895 so the object won't go away. */
2896 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002897
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002898 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002899 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002900 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002901 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2902 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002903 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002904
2905 /* need more space? (at least enough for what we
2906 have+the replacement+the rest of the string (starting
2907 at the new input position), so we won't have to check space
2908 when there are no errors in the rest of the string) */
2909 repptr = PyUnicode_AS_UNICODE(repunicode);
2910 repsize = PyUnicode_GET_SIZE(repunicode);
2911 requiredsize = *outpos + repsize + insize-newpos;
2912 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002913 if (requiredsize<2*outsize)
2914 requiredsize = 2*outsize;
2915 if (_PyUnicode_Resize(output, requiredsize) < 0)
2916 goto onError;
2917 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002918 }
2919 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002920 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002921 Py_UNICODE_COPY(*outptr, repptr, repsize);
2922 *outptr += repsize;
2923 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002924
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002925 /* we made it! */
2926 res = 0;
2927
Benjamin Peterson29060642009-01-31 22:14:21 +00002928 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002929 Py_XDECREF(restuple);
2930 return res;
2931}
2932
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002933/* --- UTF-7 Codec -------------------------------------------------------- */
2934
Antoine Pitrou244651a2009-05-04 18:56:13 +00002935/* See RFC2152 for details. We encode conservatively and decode liberally. */
2936
/* Three small macros that define the base-64 alphabet used by UTF-7. */

/* True when c is one of the 64 base-64 characters. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z'))

/* Base-64 value (0..63) of c; only meaningful when IS_BASE64(c) holds. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
2959
/* DECODE_DIRECT: true when a byte met in a UTF-7 string decodes as itself.
 * Decoding is permissive: every byte up to 127 qualifies except '+', which
 * opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
2967
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by character code (0..127) and is never written.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must lie in 1..127 to qualify; directO / directWS are flags.  All
 * three parameters are parenthesized so that an expression argument
 * (e.g. "a || b") keeps its own precedence inside the expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003013
Alexander Belopolsky40018472011-02-26 01:02:56 +00003014PyObject *
3015PyUnicode_DecodeUTF7(const char *s,
3016 Py_ssize_t size,
3017 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003018{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003019 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3020}
3021
Antoine Pitrou244651a2009-05-04 18:56:13 +00003022/* The decoder. The only state we preserve is our read position,
3023 * i.e. how many characters we have consumed. So if we end in the
3024 * middle of a shift sequence we have to back off the read position
3025 * and the output to the beginning of the sequence, otherwise we lose
3026 * all the shift state (seen bits, number of bits seen, high
3027 * surrogate). */
3028
Alexander Belopolsky40018472011-02-26 01:02:56 +00003029PyObject *
3030PyUnicode_DecodeUTF7Stateful(const char *s,
3031 Py_ssize_t size,
3032 const char *errors,
3033 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003034{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003035 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003036 Py_ssize_t startinpos;
3037 Py_ssize_t endinpos;
3038 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003039 const char *e;
3040 PyUnicodeObject *unicode;
3041 Py_UNICODE *p;
3042 const char *errmsg = "";
3043 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003044 Py_UNICODE *shiftOutStart;
3045 unsigned int base64bits = 0;
3046 unsigned long base64buffer = 0;
3047 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 PyObject *errorHandler = NULL;
3049 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003050
3051 unicode = _PyUnicode_New(size);
3052 if (!unicode)
3053 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003054 if (size == 0) {
3055 if (consumed)
3056 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003057 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003058 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003060 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003061 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003062 e = s + size;
3063
3064 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003065 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003066 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003067 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003068
Antoine Pitrou244651a2009-05-04 18:56:13 +00003069 if (inShift) { /* in a base-64 section */
3070 if (IS_BASE64(ch)) { /* consume a base-64 character */
3071 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3072 base64bits += 6;
3073 s++;
3074 if (base64bits >= 16) {
3075 /* we have enough bits for a UTF-16 value */
3076 Py_UNICODE outCh = (Py_UNICODE)
3077 (base64buffer >> (base64bits-16));
3078 base64bits -= 16;
3079 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3080 if (surrogate) {
3081 /* expecting a second surrogate */
3082 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3083#ifdef Py_UNICODE_WIDE
3084 *p++ = (((surrogate & 0x3FF)<<10)
3085 | (outCh & 0x3FF)) + 0x10000;
3086#else
3087 *p++ = surrogate;
3088 *p++ = outCh;
3089#endif
3090 surrogate = 0;
3091 }
3092 else {
3093 surrogate = 0;
3094 errmsg = "second surrogate missing";
3095 goto utf7Error;
3096 }
3097 }
3098 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3099 /* first surrogate */
3100 surrogate = outCh;
3101 }
3102 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3103 errmsg = "unexpected second surrogate";
3104 goto utf7Error;
3105 }
3106 else {
3107 *p++ = outCh;
3108 }
3109 }
3110 }
3111 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003112 inShift = 0;
3113 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003114 if (surrogate) {
3115 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003116 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003117 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003118 if (base64bits > 0) { /* left-over bits */
3119 if (base64bits >= 6) {
3120 /* We've seen at least one base-64 character */
3121 errmsg = "partial character in shift sequence";
3122 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003123 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003124 else {
3125 /* Some bits remain; they should be zero */
3126 if (base64buffer != 0) {
3127 errmsg = "non-zero padding bits in shift sequence";
3128 goto utf7Error;
3129 }
3130 }
3131 }
3132 if (ch != '-') {
3133 /* '-' is absorbed; other terminating
3134 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003135 *p++ = ch;
3136 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003137 }
3138 }
3139 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003140 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003141 s++; /* consume '+' */
3142 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003143 s++;
3144 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003145 }
3146 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003147 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003148 shiftOutStart = p;
3149 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003150 }
3151 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003152 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003153 *p++ = ch;
3154 s++;
3155 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003156 else {
3157 startinpos = s-starts;
3158 s++;
3159 errmsg = "unexpected special character";
3160 goto utf7Error;
3161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003162 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003163utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164 outpos = p-PyUnicode_AS_UNICODE(unicode);
3165 endinpos = s-starts;
3166 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 errors, &errorHandler,
3168 "utf7", errmsg,
3169 &starts, &e, &startinpos, &endinpos, &exc, &s,
3170 &unicode, &outpos, &p))
3171 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003172 }
3173
Antoine Pitrou244651a2009-05-04 18:56:13 +00003174 /* end of string */
3175
3176 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3177 /* if we're in an inconsistent state, that's an error */
3178 if (surrogate ||
3179 (base64bits >= 6) ||
3180 (base64bits > 0 && base64buffer != 0)) {
3181 outpos = p-PyUnicode_AS_UNICODE(unicode);
3182 endinpos = size;
3183 if (unicode_decode_call_errorhandler(
3184 errors, &errorHandler,
3185 "utf7", "unterminated shift sequence",
3186 &starts, &e, &startinpos, &endinpos, &exc, &s,
3187 &unicode, &outpos, &p))
3188 goto onError;
3189 if (s < e)
3190 goto restart;
3191 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003192 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003193
3194 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003195 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003196 if (inShift) {
3197 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003198 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003199 }
3200 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003201 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003202 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003203 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003204
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003205 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003206 goto onError;
3207
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 Py_XDECREF(errorHandler);
3209 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003210 if (PyUnicode_READY(unicode) == -1) {
3211 Py_DECREF(unicode);
3212 return NULL;
3213 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003214 return (PyObject *)unicode;
3215
Benjamin Peterson29060642009-01-31 22:14:21 +00003216 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003217 Py_XDECREF(errorHandler);
3218 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003219 Py_DECREF(unicode);
3220 return NULL;
3221}
3222
3223
Alexander Belopolsky40018472011-02-26 01:02:56 +00003224PyObject *
3225PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3226 Py_ssize_t size,
3227 int base64SetO,
3228 int base64WhiteSpace,
3229 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003230{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003231 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003232 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003233 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003234 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003235 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003236 unsigned int base64bits = 0;
3237 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003238 char * out;
3239 char * start;
3240
3241 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003243
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003244 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003245 return PyErr_NoMemory();
3246
Antoine Pitrou244651a2009-05-04 18:56:13 +00003247 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003248 if (v == NULL)
3249 return NULL;
3250
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003251 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003252 for (;i < size; ++i) {
3253 Py_UNICODE ch = s[i];
3254
Antoine Pitrou244651a2009-05-04 18:56:13 +00003255 if (inShift) {
3256 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3257 /* shifting out */
3258 if (base64bits) { /* output remaining bits */
3259 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3260 base64buffer = 0;
3261 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003262 }
3263 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003264 /* Characters not in the BASE64 set implicitly unshift the sequence
3265 so no '-' is required, except if the character is itself a '-' */
3266 if (IS_BASE64(ch) || ch == '-') {
3267 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003268 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003269 *out++ = (char) ch;
3270 }
3271 else {
3272 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003273 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003274 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003275 else { /* not in a shift sequence */
3276 if (ch == '+') {
3277 *out++ = '+';
3278 *out++ = '-';
3279 }
3280 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3281 *out++ = (char) ch;
3282 }
3283 else {
3284 *out++ = '+';
3285 inShift = 1;
3286 goto encode_char;
3287 }
3288 }
3289 continue;
3290encode_char:
3291#ifdef Py_UNICODE_WIDE
3292 if (ch >= 0x10000) {
3293 /* code first surrogate */
3294 base64bits += 16;
3295 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3296 while (base64bits >= 6) {
3297 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3298 base64bits -= 6;
3299 }
3300 /* prepare second surrogate */
3301 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3302 }
3303#endif
3304 base64bits += 16;
3305 base64buffer = (base64buffer << 16) | ch;
3306 while (base64bits >= 6) {
3307 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3308 base64bits -= 6;
3309 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003310 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003311 if (base64bits)
3312 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3313 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003314 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003315 if (_PyBytes_Resize(&v, out - start) < 0)
3316 return NULL;
3317 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003318}
3319
Antoine Pitrou244651a2009-05-04 18:56:13 +00003320#undef IS_BASE64
3321#undef FROM_BASE64
3322#undef TO_BASE64
3323#undef DECODE_DIRECT
3324#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003325
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326/* --- UTF-8 Codec -------------------------------------------------------- */
3327
/* Lookup table indexed by the first byte of a UTF-8 sequence.  The table is
   only ever read, so it is const-qualified. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3349
Alexander Belopolsky40018472011-02-26 01:02:56 +00003350PyObject *
3351PyUnicode_DecodeUTF8(const char *s,
3352 Py_ssize_t size,
3353 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354{
Walter Dörwald69652032004-09-07 20:24:22 +00003355 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3356}
3357
Antoine Pitrouab868312009-01-10 15:40:25 +00003358/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3359#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3360
3361/* Mask to quickly check whether a C 'long' contains a
3362 non-ASCII, UTF8-encoded char. */
3363#if (SIZEOF_LONG == 8)
3364# define ASCII_CHAR_MASK 0x8080808080808080L
3365#elif (SIZEOF_LONG == 4)
3366# define ASCII_CHAR_MASK 0x80808080L
3367#else
3368# error C 'long' size should be either 4 or 8!
3369#endif
3370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003371/* Scans a UTF-8 string and returns the maximum character to be expected,
3372 the size of the decoded unicode string and if any major errors were
3373 encountered.
3374
3375 This function does check basic UTF-8 sanity, it does however NOT CHECK
3376 if the string contains surrogates, and if all continuation bytes are
3377 within the correct ranges, these checks are performed in
3378 PyUnicode_DecodeUTF8Stateful.
3379
3380 If it sets has_errors to 1, it means the value of unicode_size and max_char
3381 will be bogus and you should not rely on useful information in them.
3382 */
3383static Py_UCS4
3384utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3385 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3386 int *has_errors)
3387{
3388 Py_ssize_t n;
3389 Py_ssize_t char_count = 0;
3390 Py_UCS4 max_char = 127, new_max;
3391 Py_UCS4 upper_bound;
3392 const unsigned char *p = (const unsigned char *)s;
3393 const unsigned char *end = p + string_size;
3394 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3395 int err = 0;
3396
3397 for (; p < end && !err; ++p, ++char_count) {
3398 /* Only check value if it's not a ASCII char... */
3399 if (*p < 0x80) {
3400 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3401 an explanation. */
3402 if (!((size_t) p & LONG_PTR_MASK)) {
3403 /* Help register allocation */
3404 register const unsigned char *_p = p;
3405 while (_p < aligned_end) {
3406 unsigned long value = *(unsigned long *) _p;
3407 if (value & ASCII_CHAR_MASK)
3408 break;
3409 _p += SIZEOF_LONG;
3410 char_count += SIZEOF_LONG;
3411 }
3412 p = _p;
3413 if (p == end)
3414 break;
3415 }
3416 }
3417 if (*p >= 0x80) {
3418 n = utf8_code_length[*p];
3419 new_max = max_char;
3420 switch (n) {
3421 /* invalid start byte */
3422 case 0:
3423 err = 1;
3424 break;
3425 case 2:
3426 /* Code points between 0x00FF and 0x07FF inclusive.
3427 Approximate the upper bound of the code point,
3428 if this flips over 255 we can be sure it will be more
3429 than 255 and the string will need 2 bytes per code coint,
3430 if it stays under or equal to 255, we can be sure 1 byte
3431 is enough.
3432 ((*p & 0b00011111) << 6) | 0b00111111 */
3433 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3434 if (max_char < upper_bound)
3435 new_max = upper_bound;
3436 /* Ensure we track at least that we left ASCII space. */
3437 if (new_max < 128)
3438 new_max = 128;
3439 break;
3440 case 3:
3441 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3442 always > 255 and <= 65535 and will always need 2 bytes. */
3443 if (max_char < 65535)
3444 new_max = 65535;
3445 break;
3446 case 4:
3447 /* Code point will be above 0xFFFF for sure in this case. */
3448 new_max = 65537;
3449 break;
3450 /* Internal error, this should be caught by the first if */
3451 case 1:
3452 default:
3453 assert(0 && "Impossible case in utf8_max_char_and_size");
3454 err = 1;
3455 }
3456 /* Instead of number of overall bytes for this code point,
3457 n containts the number of following bytes: */
3458 --n;
3459 /* Check if the follow up chars are all valid continuation bytes */
3460 if (n >= 1) {
3461 const unsigned char *cont;
3462 if ((p + n) >= end) {
3463 if (consumed == 0)
3464 /* incomplete data, non-incremental decoding */
3465 err = 1;
3466 break;
3467 }
3468 for (cont = p + 1; cont < (p + n); ++cont) {
3469 if ((*cont & 0xc0) != 0x80) {
3470 err = 1;
3471 break;
3472 }
3473 }
3474 p += n;
3475 }
3476 else
3477 err = 1;
3478 max_char = new_max;
3479 }
3480 }
3481
3482 if (unicode_size)
3483 *unicode_size = char_count;
3484 if (has_errors)
3485 *has_errors = err;
3486 return max_char;
3487}
3488
/* Counterpart of PyUnicode_WRITE that additionally understands
   PyUnicode_WCHAR_KIND, in which case buf is addressed as a Py_UNICODE
   array (the wstr field of the legacy unicode representation).  The
   1-byte and 2-byte kinds store an unsigned char / Py_UCS2; any other kind
   stores a Py_UCS4.  kind is evaluated once, and only the store that
   matches it is executed. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int wfw_kind_ = (kind);                                   \
        if (wfw_kind_ == PyUnicode_WCHAR_KIND) {                        \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        }                                                               \
        else if (wfw_kind_ == PyUnicode_1BYTE_KIND) {                   \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        }                                                               \
        else if (wfw_kind_ == PyUnicode_2BYTE_KIND) {                   \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
        }                                                               \
        else {                                                          \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
        }                                                               \
    } while (0)
3503
Alexander Belopolsky40018472011-02-26 01:02:56 +00003504PyObject *
3505PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003506 Py_ssize_t size,
3507 const char *errors,
3508 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003509{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003512 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003513 Py_ssize_t startinpos;
3514 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003515 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003517 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 PyObject *errorHandler = NULL;
3519 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003520 Py_UCS4 maxchar = 0;
3521 Py_ssize_t unicode_size;
3522 Py_ssize_t i;
3523 int kind;
3524 void *data;
3525 int has_errors;
3526 Py_UNICODE *error_outptr;
3527#if SIZEOF_WCHAR_T == 2
3528 Py_ssize_t wchar_offset = 0;
3529#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530
Walter Dörwald69652032004-09-07 20:24:22 +00003531 if (size == 0) {
3532 if (consumed)
3533 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003534 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003536 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3537 consumed, &has_errors);
3538 if (has_errors) {
3539 unicode = _PyUnicode_New(size);
3540 if (!unicode)
3541 return NULL;
3542 kind = PyUnicode_WCHAR_KIND;
3543 data = PyUnicode_AS_UNICODE(unicode);
3544 assert(data != NULL);
3545 }
3546 else {
3547 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3548 if (!unicode)
3549 return NULL;
3550 /* When the string is ASCII only, just use memcpy and return.
3551 unicode_size may be != size if there is an incomplete UTF-8
3552 sequence at the end of the ASCII block. */
3553 if (maxchar < 128 && size == unicode_size) {
3554 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3555 return (PyObject *)unicode;
3556 }
3557 kind = PyUnicode_KIND(unicode);
3558 data = PyUnicode_DATA(unicode);
3559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003561 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003563 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564
3565 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003566 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567
3568 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003569 /* Fast path for runs of ASCII characters. Given that common UTF-8
3570 input will consist of an overwhelming majority of ASCII
3571 characters, we try to optimize for this case by checking
3572 as many characters as a C 'long' can contain.
3573 First, check if we can do an aligned read, as most CPUs have
3574 a penalty for unaligned reads.
3575 */
3576 if (!((size_t) s & LONG_PTR_MASK)) {
3577 /* Help register allocation */
3578 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003579 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003580 while (_s < aligned_end) {
3581 /* Read a whole long at a time (either 4 or 8 bytes),
3582 and do a fast unrolled copy if it only contains ASCII
3583 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003584 unsigned long value = *(unsigned long *) _s;
3585 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003586 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003587 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3588 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3589 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3590 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003591#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003592 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3593 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3594 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3595 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003596#endif
3597 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003598 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003599 }
3600 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003601 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003602 if (s == e)
3603 break;
3604 ch = (unsigned char)*s;
3605 }
3606 }
3607
3608 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003609 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 s++;
3611 continue;
3612 }
3613
3614 n = utf8_code_length[ch];
3615
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003616 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 if (consumed)
3618 break;
3619 else {
3620 errmsg = "unexpected end of data";
3621 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003622 endinpos = startinpos+1;
3623 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3624 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003625 goto utf8Error;
3626 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628
3629 switch (n) {
3630
3631 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003632 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003633 startinpos = s-starts;
3634 endinpos = startinpos+1;
3635 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636
3637 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003638 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003639 startinpos = s-starts;
3640 endinpos = startinpos+1;
3641 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642
3643 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003644 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003645 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003646 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003647 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003648 goto utf8Error;
3649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003651 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003652 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 break;
3654
3655 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003656 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3657 will result in surrogates in range d800-dfff. Surrogates are
3658 not valid UTF-8 so they are rejected.
3659 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3660 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003661 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003662 (s[2] & 0xc0) != 0x80 ||
3663 ((unsigned char)s[0] == 0xE0 &&
3664 (unsigned char)s[1] < 0xA0) ||
3665 ((unsigned char)s[0] == 0xED &&
3666 (unsigned char)s[1] > 0x9F)) {
3667 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003669 endinpos = startinpos + 1;
3670
3671 /* if s[1] first two bits are 1 and 0, then the invalid
3672 continuation byte is s[2], so increment endinpos by 1,
3673 if not, s[1] is invalid and endinpos doesn't need to
3674 be incremented. */
3675 if ((s[1] & 0xC0) == 0x80)
3676 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 goto utf8Error;
3678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003680 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003681 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003682 break;
3683
3684 case 4:
3685 if ((s[1] & 0xc0) != 0x80 ||
3686 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003687 (s[3] & 0xc0) != 0x80 ||
3688 ((unsigned char)s[0] == 0xF0 &&
3689 (unsigned char)s[1] < 0x90) ||
3690 ((unsigned char)s[0] == 0xF4 &&
3691 (unsigned char)s[1] > 0x8F)) {
3692 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003694 endinpos = startinpos + 1;
3695 if ((s[1] & 0xC0) == 0x80) {
3696 endinpos++;
3697 if ((s[2] & 0xC0) == 0x80)
3698 endinpos++;
3699 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003700 goto utf8Error;
3701 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003702 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003703 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3704 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003706 /* If the string is flexible or we have native UCS-4, write
3707 directly.. */
3708 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3709 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711 else {
3712 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003714 /* translate from 10000..10FFFF to 0..FFFF */
3715 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 /* high surrogate = top 10 bits added to D800 */
3718 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3719 (Py_UNICODE)(0xD800 + (ch >> 10)));
3720
3721 /* low surrogate = bottom 10 bits added to DC00 */
3722 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3723 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3724 }
3725#if SIZEOF_WCHAR_T == 2
3726 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003727#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 }
3730 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003731 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003732
Benjamin Peterson29060642009-01-31 22:14:21 +00003733 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734 /* If this is not yet a resizable string, make it one.. */
3735 if (kind != PyUnicode_WCHAR_KIND) {
3736 const Py_UNICODE *u;
3737 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3738 if (!new_unicode)
3739 goto onError;
3740 u = PyUnicode_AsUnicode((PyObject *)unicode);
3741 if (!u)
3742 goto onError;
3743#if SIZEOF_WCHAR_T == 2
3744 i += wchar_offset;
3745#endif
3746 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3747 Py_DECREF(unicode);
3748 unicode = new_unicode;
3749 kind = 0;
3750 data = PyUnicode_AS_UNICODE(new_unicode);
3751 assert(data != NULL);
3752 }
3753 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003754 if (unicode_decode_call_errorhandler(
3755 errors, &errorHandler,
3756 "utf8", errmsg,
3757 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003759 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003760 /* Update data because unicode_decode_call_errorhandler might have
3761 re-created or resized the unicode object. */
3762 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003763 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 /* Ensure the unicode_size calculation above was correct: */
3766 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3767
Walter Dörwald69652032004-09-07 20:24:22 +00003768 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003769 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 /* Adjust length and ready string when it contained errors and
3772 is of the old resizable kind. */
3773 if (kind == PyUnicode_WCHAR_KIND) {
3774 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3775 PyUnicode_READY(unicode) == -1)
3776 goto onError;
3777 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003779 Py_XDECREF(errorHandler);
3780 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 if (PyUnicode_READY(unicode) == -1) {
3782 Py_DECREF(unicode);
3783 return NULL;
3784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 return (PyObject *)unicode;
3786
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 Py_XDECREF(errorHandler);
3789 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 Py_DECREF(unicode);
3791 return NULL;
3792}
3793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003794#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003795
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the size bytes at s into a buffer obtained from PyMem_Malloc()
   and terminated with L'\0'.  A byte that does not begin a sequence
   accepted by the checks below is stored as the single code unit
   0xDC00 + byte, and decoding continues with the next byte.
   Returns NULL if the buffer cannot be allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *stop;
    wchar_t *result, *out;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    result = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (result == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = result;
    stop = s + size;
    while (s < stop) {
        const Py_UCS4 lead = (unsigned char)*s;
        Py_UCS4 ch;
        int seqlen;
        int wellformed = 0;

        /* plain ASCII is copied through */
        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        seqlen = utf8_code_length[lead];

        /* Sequences that would run past the end of the input are never
           inspected and end up in the escape branch below. */
        if (!(s + seqlen > stop)) {
            switch (seqlen) {
            case 0:
            case 1:
                /* not a valid start byte */
                break;

            case 2:
                wellformed = ((s[1] & 0xc0) == 0x80);
                if (wellformed) {
                    ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
                    assert ((ch > 0x007F) && (ch <= 0x07FF));
                    *out++ = (wchar_t)ch;
                }
                break;

            case 3:
                /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
                   will result in surrogates in range d800-dfff. Surrogates are
                   not valid UTF-8 so they are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                wellformed =
                    (s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    !((unsigned char)s[0] == 0xE0 &&
                      (unsigned char)s[1] < 0xA0) &&
                    !((unsigned char)s[0] == 0xED &&
                      (unsigned char)s[1] > 0x9F);
                if (wellformed) {
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6)
                         + (s[2] & 0x3f);
                    assert ((ch > 0x07FF) && (ch <= 0xFFFF));
                    *out++ = (wchar_t)ch;
                }
                break;

            case 4:
                wellformed =
                    (s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    (s[3] & 0xc0) == 0x80 &&
                    !((unsigned char)s[0] == 0xF0 &&
                      (unsigned char)s[1] < 0x90) &&
                    !((unsigned char)s[0] == 0xF4 &&
                      (unsigned char)s[1] > 0x8F);
                if (wellformed) {
                    ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                         ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
                    assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
                    *out++ = (wchar_t)ch;
#else
                    /* compute and append the two surrogates: */

                    /* translate from 10000..10FFFF to 0..FFFF */
                    ch -= 0x10000;

                    /* high surrogate = top 10 bits added to D800 */
                    *out++ = (wchar_t)(0xD800 + (ch >> 10));

                    /* low surrogate = bottom 10 bits added to DC00 */
                    *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
                }
                break;
            }
        }

        if (wellformed) {
            s += seqlen;
        }
        else {
            /* surrogateescape: keep the offending byte as 0xDC00 + byte
               and resume decoding right after it */
            *out++ = 0xDC00 + lead;
            s++;
        }
    }
    *out = L'\0';
    return result;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911/* Primary internal function which creates utf8 encoded bytes objects.
3912
3913 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003914 and allocate exactly as much space needed at the end. Else allocate the
3915 maximum possible needed (4 result bytes per Unicode character), and return
3916 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003917*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003918PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920{
Tim Peters602f7402002-04-27 18:03:26 +00003921#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003922
Guido van Rossum98297ee2007-11-06 21:34:58 +00003923 Py_ssize_t i; /* index into s of next input byte */
3924 PyObject *result; /* result string object */
3925 char *p; /* next free byte in output buffer */
3926 Py_ssize_t nallocated; /* number of result bytes allocated */
3927 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003928 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003929 PyObject *errorHandler = NULL;
3930 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931 int kind;
3932 void *data;
3933 Py_ssize_t size;
3934 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3935#if SIZEOF_WCHAR_T == 2
3936 Py_ssize_t wchar_offset = 0;
3937#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939 if (!PyUnicode_Check(unicode)) {
3940 PyErr_BadArgument();
3941 return NULL;
3942 }
3943
3944 if (PyUnicode_READY(unicode) == -1)
3945 return NULL;
3946
3947 if (_PyUnicode_UTF8(unicode))
3948 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
3949 _PyUnicode_UTF8_LENGTH(unicode));
3950
3951 kind = PyUnicode_KIND(unicode);
3952 data = PyUnicode_DATA(unicode);
3953 size = PyUnicode_GET_LENGTH(unicode);
3954
Tim Peters602f7402002-04-27 18:03:26 +00003955 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956
Tim Peters602f7402002-04-27 18:03:26 +00003957 if (size <= MAX_SHORT_UNICHARS) {
3958 /* Write into the stack buffer; nallocated can't overflow.
3959 * At the end, we'll allocate exactly as much heap space as it
3960 * turns out we need.
3961 */
3962 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003963 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00003964 p = stackbuf;
3965 }
3966 else {
3967 /* Overallocate on the heap, and give the excess back at the end. */
3968 nallocated = size * 4;
3969 if (nallocated / 4 != size) /* overflow! */
3970 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00003971 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003972 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00003973 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003974 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003975 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003976
Tim Peters602f7402002-04-27 18:03:26 +00003977 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003979
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003980 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00003981 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003983
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00003985 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00003986 *p++ = (char)(0xc0 | (ch >> 6));
3987 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00003988 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003989 Py_ssize_t newpos;
3990 PyObject *rep;
3991 Py_ssize_t repsize, k, startpos;
3992 startpos = i-1;
3993#if SIZEOF_WCHAR_T == 2
3994 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00003995#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996 rep = unicode_encode_call_errorhandler(
3997 errors, &errorHandler, "utf-8", "surrogates not allowed",
3998 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3999 &exc, startpos, startpos+1, &newpos);
4000 if (!rep)
4001 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003 if (PyBytes_Check(rep))
4004 repsize = PyBytes_GET_SIZE(rep);
4005 else
4006 repsize = PyUnicode_GET_SIZE(rep);
4007
4008 if (repsize > 4) {
4009 Py_ssize_t offset;
4010
4011 if (result == NULL)
4012 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004013 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4017 /* integer overflow */
4018 PyErr_NoMemory();
4019 goto error;
4020 }
4021 nallocated += repsize - 4;
4022 if (result != NULL) {
4023 if (_PyBytes_Resize(&result, nallocated) < 0)
4024 goto error;
4025 } else {
4026 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004027 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028 goto error;
4029 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4030 }
4031 p = PyBytes_AS_STRING(result) + offset;
4032 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034 if (PyBytes_Check(rep)) {
4035 char *prep = PyBytes_AS_STRING(rep);
4036 for(k = repsize; k > 0; k--)
4037 *p++ = *prep++;
4038 } else /* rep is unicode */ {
4039 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4040 Py_UNICODE c;
4041
4042 for(k=0; k<repsize; k++) {
4043 c = prep[k];
4044 if (0x80 <= c) {
4045 raise_encode_exception(&exc, "utf-8",
4046 PyUnicode_AS_UNICODE(unicode),
4047 size, i-1, i,
4048 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004049 goto error;
4050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004052 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004055 } else if (ch < 0x10000) {
4056 *p++ = (char)(0xe0 | (ch >> 12));
4057 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4058 *p++ = (char)(0x80 | (ch & 0x3f));
4059 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004060 /* Encode UCS4 Unicode ordinals */
4061 *p++ = (char)(0xf0 | (ch >> 18));
4062 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4063 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4064 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065#if SIZEOF_WCHAR_T == 2
4066 wchar_offset++;
4067#endif
Tim Peters602f7402002-04-27 18:03:26 +00004068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004070
Guido van Rossum98297ee2007-11-06 21:34:58 +00004071 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004072 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004073 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004074 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004075 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004076 }
4077 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004078 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004079 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004080 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004081 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004084 Py_XDECREF(errorHandler);
4085 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004086 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004087 error:
4088 Py_XDECREF(errorHandler);
4089 Py_XDECREF(exc);
4090 Py_XDECREF(result);
4091 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004092
Tim Peters602f7402002-04-27 18:03:26 +00004093#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094}
4095
Alexander Belopolsky40018472011-02-26 01:02:56 +00004096PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4098 Py_ssize_t size,
4099 const char *errors)
4100{
4101 PyObject *v, *unicode;
4102
4103 unicode = PyUnicode_FromUnicode(s, size);
4104 if (unicode == NULL)
4105 return NULL;
4106 v = _PyUnicode_AsUTF8String(unicode, errors);
4107 Py_DECREF(unicode);
4108 return v;
4109}
4110
4111PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004112PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115}
4116
Walter Dörwald41980ca2007-08-16 21:55:45 +00004117/* --- UTF-32 Codec ------------------------------------------------------- */
4118
4119PyObject *
4120PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004121 Py_ssize_t size,
4122 const char *errors,
4123 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004124{
4125 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4126}
4127
4128PyObject *
4129PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 Py_ssize_t size,
4131 const char *errors,
4132 int *byteorder,
4133 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004134{
4135 const char *starts = s;
4136 Py_ssize_t startinpos;
4137 Py_ssize_t endinpos;
4138 Py_ssize_t outpos;
4139 PyUnicodeObject *unicode;
4140 Py_UNICODE *p;
4141#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004142 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004143 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004144#else
4145 const int pairs = 0;
4146#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004147 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004148 int bo = 0; /* assume native ordering by default */
4149 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004150 /* Offsets from q for retrieving bytes in the right order. */
4151#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4152 int iorder[] = {0, 1, 2, 3};
4153#else
4154 int iorder[] = {3, 2, 1, 0};
4155#endif
4156 PyObject *errorHandler = NULL;
4157 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004158
Walter Dörwald41980ca2007-08-16 21:55:45 +00004159 q = (unsigned char *)s;
4160 e = q + size;
4161
4162 if (byteorder)
4163 bo = *byteorder;
4164
4165 /* Check for BOM marks (U+FEFF) in the input and adjust current
4166 byte order setting accordingly. In native mode, the leading BOM
4167 mark is skipped, in all other modes, it is copied to the output
4168 stream as-is (giving a ZWNBSP character). */
4169 if (bo == 0) {
4170 if (size >= 4) {
4171 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004172 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004173#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 if (bom == 0x0000FEFF) {
4175 q += 4;
4176 bo = -1;
4177 }
4178 else if (bom == 0xFFFE0000) {
4179 q += 4;
4180 bo = 1;
4181 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004182#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 if (bom == 0x0000FEFF) {
4184 q += 4;
4185 bo = 1;
4186 }
4187 else if (bom == 0xFFFE0000) {
4188 q += 4;
4189 bo = -1;
4190 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004191#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004193 }
4194
4195 if (bo == -1) {
4196 /* force LE */
4197 iorder[0] = 0;
4198 iorder[1] = 1;
4199 iorder[2] = 2;
4200 iorder[3] = 3;
4201 }
4202 else if (bo == 1) {
4203 /* force BE */
4204 iorder[0] = 3;
4205 iorder[1] = 2;
4206 iorder[2] = 1;
4207 iorder[3] = 0;
4208 }
4209
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004210 /* On narrow builds we split characters outside the BMP into two
4211 codepoints => count how much extra space we need. */
4212#ifndef Py_UNICODE_WIDE
4213 for (qq = q; qq < e; qq += 4)
4214 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4215 pairs++;
4216#endif
4217
4218 /* This might be one to much, because of a BOM */
4219 unicode = _PyUnicode_New((size+3)/4+pairs);
4220 if (!unicode)
4221 return NULL;
4222 if (size == 0)
4223 return (PyObject *)unicode;
4224
4225 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004226 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004227
Walter Dörwald41980ca2007-08-16 21:55:45 +00004228 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 Py_UCS4 ch;
4230 /* remaining bytes at the end? (size should be divisible by 4) */
4231 if (e-q<4) {
4232 if (consumed)
4233 break;
4234 errmsg = "truncated data";
4235 startinpos = ((const char *)q)-starts;
4236 endinpos = ((const char *)e)-starts;
4237 goto utf32Error;
4238 /* The remaining input chars are ignored if the callback
4239 chooses to skip the input */
4240 }
4241 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4242 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004243
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 if (ch >= 0x110000)
4245 {
4246 errmsg = "codepoint not in range(0x110000)";
4247 startinpos = ((const char *)q)-starts;
4248 endinpos = startinpos+4;
4249 goto utf32Error;
4250 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004251#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 if (ch >= 0x10000)
4253 {
4254 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4255 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4256 }
4257 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004258#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 *p++ = ch;
4260 q += 4;
4261 continue;
4262 utf32Error:
4263 outpos = p-PyUnicode_AS_UNICODE(unicode);
4264 if (unicode_decode_call_errorhandler(
4265 errors, &errorHandler,
4266 "utf32", errmsg,
4267 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4268 &unicode, &outpos, &p))
4269 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004270 }
4271
4272 if (byteorder)
4273 *byteorder = bo;
4274
4275 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004277
4278 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004279 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004280 goto onError;
4281
4282 Py_XDECREF(errorHandler);
4283 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004284 if (PyUnicode_READY(unicode) == -1) {
4285 Py_DECREF(unicode);
4286 return NULL;
4287 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004288 return (PyObject *)unicode;
4289
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004291 Py_DECREF(unicode);
4292 Py_XDECREF(errorHandler);
4293 Py_XDECREF(exc);
4294 return NULL;
4295}
4296
4297PyObject *
4298PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 Py_ssize_t size,
4300 const char *errors,
4301 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004302{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004303 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004304 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004305 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004306#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004307 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004308#else
4309 const int pairs = 0;
4310#endif
4311 /* Offsets from p for storing byte pairs in the right order. */
4312#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4313 int iorder[] = {0, 1, 2, 3};
4314#else
4315 int iorder[] = {3, 2, 1, 0};
4316#endif
4317
Benjamin Peterson29060642009-01-31 22:14:21 +00004318#define STORECHAR(CH) \
4319 do { \
4320 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4321 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4322 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4323 p[iorder[0]] = (CH) & 0xff; \
4324 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004325 } while(0)
4326
4327 /* In narrow builds we can output surrogate pairs as one codepoint,
4328 so we need less space. */
4329#ifndef Py_UNICODE_WIDE
4330 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004331 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4332 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4333 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004334#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004335 nsize = (size - pairs + (byteorder == 0));
4336 bytesize = nsize * 4;
4337 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004338 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004339 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004340 if (v == NULL)
4341 return NULL;
4342
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004343 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004344 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004345 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004346 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004347 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004348
4349 if (byteorder == -1) {
4350 /* force LE */
4351 iorder[0] = 0;
4352 iorder[1] = 1;
4353 iorder[2] = 2;
4354 iorder[3] = 3;
4355 }
4356 else if (byteorder == 1) {
4357 /* force BE */
4358 iorder[0] = 3;
4359 iorder[1] = 2;
4360 iorder[2] = 1;
4361 iorder[3] = 0;
4362 }
4363
4364 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004366#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4368 Py_UCS4 ch2 = *s;
4369 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4370 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4371 s++;
4372 size--;
4373 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004374 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004375#endif
4376 STORECHAR(ch);
4377 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004378
4379 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004380 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004381#undef STORECHAR
4382}
4383
Alexander Belopolsky40018472011-02-26 01:02:56 +00004384PyObject *
4385PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004386{
4387 if (!PyUnicode_Check(unicode)) {
4388 PyErr_BadArgument();
4389 return NULL;
4390 }
4391 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004392 PyUnicode_GET_SIZE(unicode),
4393 NULL,
4394 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004395}
4396
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397/* --- UTF-16 Codec ------------------------------------------------------- */
4398
Tim Peters772747b2001-08-09 22:21:55 +00004399PyObject *
4400PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004401 Py_ssize_t size,
4402 const char *errors,
4403 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404{
Walter Dörwald69652032004-09-07 20:24:22 +00004405 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4406}
4407
Antoine Pitrouab868312009-01-10 15:40:25 +00004408/* Two masks for fast checking of whether a C 'long' may contain
4409 UTF16-encoded surrogate characters. This is an efficient heuristic,
4410 assuming that non-surrogate characters with a code point >= 0x8000 are
4411 rare in most input.
4412 FAST_CHAR_MASK is used when the input is in native byte ordering,
4413 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004414*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004415#if (SIZEOF_LONG == 8)
4416# define FAST_CHAR_MASK 0x8000800080008000L
4417# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4418#elif (SIZEOF_LONG == 4)
4419# define FAST_CHAR_MASK 0x80008000L
4420# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4421#else
4422# error C 'long' size should be either 4 or 8!
4423#endif
4424
Walter Dörwald69652032004-09-07 20:24:22 +00004425PyObject *
4426PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 Py_ssize_t size,
4428 const char *errors,
4429 int *byteorder,
4430 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004431{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004433 Py_ssize_t startinpos;
4434 Py_ssize_t endinpos;
4435 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436 PyUnicodeObject *unicode;
4437 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004438 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004439 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004440 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004441 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004442 /* Offsets from q for retrieving byte pairs in the right order. */
4443#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4444 int ihi = 1, ilo = 0;
4445#else
4446 int ihi = 0, ilo = 1;
4447#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448 PyObject *errorHandler = NULL;
4449 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450
4451 /* Note: size will always be longer than the resulting Unicode
4452 character count */
4453 unicode = _PyUnicode_New(size);
4454 if (!unicode)
4455 return NULL;
4456 if (size == 0)
4457 return (PyObject *)unicode;
4458
4459 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004460 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004461 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004462 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463
4464 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004465 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004467 /* Check for BOM marks (U+FEFF) in the input and adjust current
4468 byte order setting accordingly. In native mode, the leading BOM
4469 mark is skipped, in all other modes, it is copied to the output
4470 stream as-is (giving a ZWNBSP character). */
4471 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004472 if (size >= 2) {
4473 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004474#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 if (bom == 0xFEFF) {
4476 q += 2;
4477 bo = -1;
4478 }
4479 else if (bom == 0xFFFE) {
4480 q += 2;
4481 bo = 1;
4482 }
Tim Petersced69f82003-09-16 20:30:58 +00004483#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004484 if (bom == 0xFEFF) {
4485 q += 2;
4486 bo = 1;
4487 }
4488 else if (bom == 0xFFFE) {
4489 q += 2;
4490 bo = -1;
4491 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004492#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004494 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495
Tim Peters772747b2001-08-09 22:21:55 +00004496 if (bo == -1) {
4497 /* force LE */
4498 ihi = 1;
4499 ilo = 0;
4500 }
4501 else if (bo == 1) {
4502 /* force BE */
4503 ihi = 0;
4504 ilo = 1;
4505 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004506#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4507 native_ordering = ilo < ihi;
4508#else
4509 native_ordering = ilo > ihi;
4510#endif
Tim Peters772747b2001-08-09 22:21:55 +00004511
Antoine Pitrouab868312009-01-10 15:40:25 +00004512 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004513 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004515 /* First check for possible aligned read of a C 'long'. Unaligned
4516 reads are more expensive, better to defer to another iteration. */
4517 if (!((size_t) q & LONG_PTR_MASK)) {
4518 /* Fast path for runs of non-surrogate chars. */
4519 register const unsigned char *_q = q;
4520 Py_UNICODE *_p = p;
4521 if (native_ordering) {
4522 /* Native ordering is simple: as long as the input cannot
4523 possibly contain a surrogate char, do an unrolled copy
4524 of several 16-bit code points to the target object.
4525 The non-surrogate check is done on several input bytes
4526 at a time (as many as a C 'long' can contain). */
4527 while (_q < aligned_end) {
4528 unsigned long data = * (unsigned long *) _q;
4529 if (data & FAST_CHAR_MASK)
4530 break;
4531 _p[0] = ((unsigned short *) _q)[0];
4532 _p[1] = ((unsigned short *) _q)[1];
4533#if (SIZEOF_LONG == 8)
4534 _p[2] = ((unsigned short *) _q)[2];
4535 _p[3] = ((unsigned short *) _q)[3];
4536#endif
4537 _q += SIZEOF_LONG;
4538 _p += SIZEOF_LONG / 2;
4539 }
4540 }
4541 else {
4542 /* Byteswapped ordering is similar, but we must decompose
4543 the copy bytewise, and take care of zero'ing out the
4544 upper bytes if the target object is in 32-bit units
4545 (that is, in UCS-4 builds). */
4546 while (_q < aligned_end) {
4547 unsigned long data = * (unsigned long *) _q;
4548 if (data & SWAPPED_FAST_CHAR_MASK)
4549 break;
4550 /* Zero upper bytes in UCS-4 builds */
4551#if (Py_UNICODE_SIZE > 2)
4552 _p[0] = 0;
4553 _p[1] = 0;
4554#if (SIZEOF_LONG == 8)
4555 _p[2] = 0;
4556 _p[3] = 0;
4557#endif
4558#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004559 /* Issue #4916; UCS-4 builds on big endian machines must
4560 fill the two last bytes of each 4-byte unit. */
4561#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4562# define OFF 2
4563#else
4564# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004565#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004566 ((unsigned char *) _p)[OFF + 1] = _q[0];
4567 ((unsigned char *) _p)[OFF + 0] = _q[1];
4568 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4569 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4570#if (SIZEOF_LONG == 8)
4571 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4572 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4573 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4574 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4575#endif
4576#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004577 _q += SIZEOF_LONG;
4578 _p += SIZEOF_LONG / 2;
4579 }
4580 }
4581 p = _p;
4582 q = _q;
4583 if (q >= e)
4584 break;
4585 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587
Benjamin Peterson14339b62009-01-31 16:36:08 +00004588 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004589
4590 if (ch < 0xD800 || ch > 0xDFFF) {
4591 *p++ = ch;
4592 continue;
4593 }
4594
4595 /* UTF-16 code pair: */
4596 if (q > e) {
4597 errmsg = "unexpected end of data";
4598 startinpos = (((const char *)q) - 2) - starts;
4599 endinpos = ((const char *)e) + 1 - starts;
4600 goto utf16Error;
4601 }
4602 if (0xD800 <= ch && ch <= 0xDBFF) {
4603 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4604 q += 2;
4605 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004606#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 *p++ = ch;
4608 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004609#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004611#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 continue;
4613 }
4614 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004615 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 startinpos = (((const char *)q)-4)-starts;
4617 endinpos = startinpos+2;
4618 goto utf16Error;
4619 }
4620
Benjamin Peterson14339b62009-01-31 16:36:08 +00004621 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004622 errmsg = "illegal encoding";
4623 startinpos = (((const char *)q)-2)-starts;
4624 endinpos = startinpos+2;
4625 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004626
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 utf16Error:
4628 outpos = p - PyUnicode_AS_UNICODE(unicode);
4629 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004630 errors,
4631 &errorHandler,
4632 "utf16", errmsg,
4633 &starts,
4634 (const char **)&e,
4635 &startinpos,
4636 &endinpos,
4637 &exc,
4638 (const char **)&q,
4639 &unicode,
4640 &outpos,
4641 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004644 /* remaining byte at the end? (size should be even) */
4645 if (e == q) {
4646 if (!consumed) {
4647 errmsg = "truncated data";
4648 startinpos = ((const char *)q) - starts;
4649 endinpos = ((const char *)e) + 1 - starts;
4650 outpos = p - PyUnicode_AS_UNICODE(unicode);
4651 if (unicode_decode_call_errorhandler(
4652 errors,
4653 &errorHandler,
4654 "utf16", errmsg,
4655 &starts,
4656 (const char **)&e,
4657 &startinpos,
4658 &endinpos,
4659 &exc,
4660 (const char **)&q,
4661 &unicode,
4662 &outpos,
4663 &p))
4664 goto onError;
4665 /* The remaining input chars are ignored if the callback
4666 chooses to skip the input */
4667 }
4668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669
4670 if (byteorder)
4671 *byteorder = bo;
4672
Walter Dörwald69652032004-09-07 20:24:22 +00004673 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004675
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004677 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678 goto onError;
4679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 Py_XDECREF(errorHandler);
4681 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004682 if (PyUnicode_READY(unicode) == -1) {
4683 Py_DECREF(unicode);
4684 return NULL;
4685 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686 return (PyObject *)unicode;
4687
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690 Py_XDECREF(errorHandler);
4691 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 return NULL;
4693}
4694
Antoine Pitrouab868312009-01-10 15:40:25 +00004695#undef FAST_CHAR_MASK
4696#undef SWAPPED_FAST_CHAR_MASK
4697
Tim Peters772747b2001-08-09 22:21:55 +00004698PyObject *
4699PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004700 Py_ssize_t size,
4701 const char *errors,
4702 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004704 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004705 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004706 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004707#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004708 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004709#else
4710 const int pairs = 0;
4711#endif
Tim Peters772747b2001-08-09 22:21:55 +00004712 /* Offsets from p for storing byte pairs in the right order. */
4713#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4714 int ihi = 1, ilo = 0;
4715#else
4716 int ihi = 0, ilo = 1;
4717#endif
4718
Benjamin Peterson29060642009-01-31 22:14:21 +00004719#define STORECHAR(CH) \
4720 do { \
4721 p[ihi] = ((CH) >> 8) & 0xff; \
4722 p[ilo] = (CH) & 0xff; \
4723 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004724 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004726#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004727 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004728 if (s[i] >= 0x10000)
4729 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004730#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004731 /* 2 * (size + pairs + (byteorder == 0)) */
4732 if (size > PY_SSIZE_T_MAX ||
4733 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004735 nsize = size + pairs + (byteorder == 0);
4736 bytesize = nsize * 2;
4737 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004738 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004739 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 if (v == NULL)
4741 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004743 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004745 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004746 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004747 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004748
4749 if (byteorder == -1) {
4750 /* force LE */
4751 ihi = 1;
4752 ilo = 0;
4753 }
4754 else if (byteorder == 1) {
4755 /* force BE */
4756 ihi = 0;
4757 ilo = 1;
4758 }
4759
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004760 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 Py_UNICODE ch = *s++;
4762 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004763#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 if (ch >= 0x10000) {
4765 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4766 ch = 0xD800 | ((ch-0x10000) >> 10);
4767 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004768#endif
Tim Peters772747b2001-08-09 22:21:55 +00004769 STORECHAR(ch);
4770 if (ch2)
4771 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004772 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004773
4774 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004775 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004776#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777}
4778
Alexander Belopolsky40018472011-02-26 01:02:56 +00004779PyObject *
4780PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781{
4782 if (!PyUnicode_Check(unicode)) {
4783 PyErr_BadArgument();
4784 return NULL;
4785 }
4786 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 PyUnicode_GET_SIZE(unicode),
4788 NULL,
4789 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790}
4791
4792/* --- Unicode Escape Codec ----------------------------------------------- */
4793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004794/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4795 if all the escapes in the string make it still a valid ASCII string.
4796 Returns -1 if any escapes were found which cause the string to
4797 pop out of ASCII range. Otherwise returns the length of the
4798 required buffer to hold the string.
4799 */
4800Py_ssize_t
4801length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4802{
4803 const unsigned char *p = (const unsigned char *)s;
4804 const unsigned char *end = p + size;
4805 Py_ssize_t length = 0;
4806
4807 if (size < 0)
4808 return -1;
4809
4810 for (; p < end; ++p) {
4811 if (*p > 127) {
4812 /* Non-ASCII */
4813 return -1;
4814 }
4815 else if (*p != '\\') {
4816 /* Normal character */
4817 ++length;
4818 }
4819 else {
4820 /* Backslash-escape, check next char */
4821 ++p;
4822 /* Escape sequence reaches till end of string or
4823 non-ASCII follow-up. */
4824 if (p >= end || *p > 127)
4825 return -1;
4826 switch (*p) {
4827 case '\n':
4828 /* backslash + \n result in zero characters */
4829 break;
4830 case '\\': case '\'': case '\"':
4831 case 'b': case 'f': case 't':
4832 case 'n': case 'r': case 'v': case 'a':
4833 ++length;
4834 break;
4835 case '0': case '1': case '2': case '3':
4836 case '4': case '5': case '6': case '7':
4837 case 'x': case 'u': case 'U': case 'N':
4838 /* these do not guarantee ASCII characters */
4839 return -1;
4840 default:
4841 /* count the backslash + the other character */
4842 length += 2;
4843 }
4844 }
4845 }
4846 return length;
4847}
4848
/* Similar to PyUnicode_WRITE, but the destination is either the wstr
   buffer or a one-byte-per-character buffer: when `kind` equals
   PyUnicode_WCHAR_KIND, `buf` is indexed as a Py_UNICODE array; for every
   other kind it is indexed as an unsigned char array.  Only one of the two
   branches runs, so `index` and `value` are evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` at `buf[index]`, treating `buf` as a Py_UNICODE array.
   The assertion refers to a variable named `kind` that must be in scope
   where the macro is expanded.  The expansion is wrapped in
   do { } while (0) so that it always behaves as a single statement instead
   of a bare comma expression. */
#define WRITE_WSTR(buf, index, value) \
    do { \
        assert(kind == PyUnicode_WCHAR_KIND); \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)
4862
4863
Fredrik Lundh06d12682001-01-24 07:59:11 +00004864static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004865
Alexander Belopolsky40018472011-02-26 01:02:56 +00004866PyObject *
4867PyUnicode_DecodeUnicodeEscape(const char *s,
4868 Py_ssize_t size,
4869 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004872 Py_ssize_t startinpos;
4873 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004874 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004878 char* message;
4879 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004880 PyObject *errorHandler = NULL;
4881 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882 Py_ssize_t ascii_length;
4883 Py_ssize_t i;
4884 int kind;
4885 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004887 ascii_length = length_of_escaped_ascii_string(s, size);
4888
4889 /* After length_of_escaped_ascii_string() there are two alternatives,
4890 either the string is pure ASCII with named escapes like \n, etc.
4891 and we determined it's exact size (common case)
4892 or it contains \x, \u, ... escape sequences. then we create a
4893 legacy wchar string and resize it at the end of this function. */
4894 if (ascii_length >= 0) {
4895 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4896 if (!v)
4897 goto onError;
4898 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4899 kind = PyUnicode_1BYTE_KIND;
4900 data = PyUnicode_DATA(v);
4901 }
4902 else {
4903 /* Escaped strings will always be longer than the resulting
4904 Unicode string, so we start with size here and then reduce the
4905 length after conversion to the true value.
4906 (but if the error callback returns a long replacement string
4907 we'll have to allocate more space) */
4908 v = _PyUnicode_New(size);
4909 if (!v)
4910 goto onError;
4911 kind = PyUnicode_WCHAR_KIND;
4912 data = PyUnicode_AS_UNICODE(v);
4913 }
4914
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915 if (size == 0)
4916 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004917 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004919
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920 while (s < end) {
4921 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004922 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004925 if (kind == PyUnicode_WCHAR_KIND) {
4926 assert(i < _PyUnicode_WSTR_LENGTH(v));
4927 }
4928 else {
4929 /* The only case in which i == ascii_length is a backslash
4930 followed by a newline. */
4931 assert(i <= ascii_length);
4932 }
4933
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 /* Non-escape characters are interpreted as Unicode ordinals */
4935 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004936 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 continue;
4938 }
4939
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004940 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 /* \ - Escapes */
4942 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004943 c = *s++;
4944 if (s > end)
4945 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004946
4947 if (kind == PyUnicode_WCHAR_KIND) {
4948 assert(i < _PyUnicode_WSTR_LENGTH(v));
4949 }
4950 else {
4951 /* The only case in which i == ascii_length is a backslash
4952 followed by a newline. */
4953 assert(i < ascii_length || (i == ascii_length && c == '\n'));
4954 }
4955
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004956 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004960 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
4961 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
4962 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
4963 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
4964 /* FF */
4965 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
4966 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
4967 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
4968 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
4969 /* VT */
4970 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
4971 /* BEL, not classic C */
4972 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 case '0': case '1': case '2': case '3':
4976 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004977 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004978 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004979 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004980 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004981 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004983 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 break;
4985
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 /* hex escapes */
4987 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00004989 digits = 2;
4990 message = "truncated \\xXX escape";
4991 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00004995 digits = 4;
4996 message = "truncated \\uXXXX escape";
4997 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998
Benjamin Peterson29060642009-01-31 22:14:21 +00004999 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005000 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005001 digits = 8;
5002 message = "truncated \\UXXXXXXXX escape";
5003 hexescape:
5004 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005005 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006 if (s+digits>end) {
5007 endinpos = size;
5008 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 errors, &errorHandler,
5010 "unicodeescape", "end of string in escape sequence",
5011 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005012 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005013 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005014 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005015 goto nextByte;
5016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005017 for (j = 0; j < digits; ++j) {
5018 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005019 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005020 endinpos = (s+j+1)-starts;
5021 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 errors, &errorHandler,
5024 "unicodeescape", message,
5025 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005026 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005027 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005028 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005029 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005030 }
5031 chr = (chr<<4) & ~0xF;
5032 if (c >= '0' && c <= '9')
5033 chr += c - '0';
5034 else if (c >= 'a' && c <= 'f')
5035 chr += 10 + c - 'a';
5036 else
5037 chr += 10 + c - 'A';
5038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005039 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005040 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005041 /* _decoding_error will have already written into the
5042 target buffer. */
5043 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005044 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005045 /* when we get here, chr is a 32-bit unicode character */
5046 if (chr <= 0xffff)
5047 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005048 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005049 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005050 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005051 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005052#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005053 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005054#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005055 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005056 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5057 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005058#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005059 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005060 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005061 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005062 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 errors, &errorHandler,
5064 "unicodeescape", "illegal Unicode character",
5065 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005066 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005067 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005068 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005069 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005070 break;
5071
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005073 case 'N':
5074 message = "malformed \\N character escape";
5075 if (ucnhash_CAPI == NULL) {
5076 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005077 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5078 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005079 if (ucnhash_CAPI == NULL)
5080 goto ucnhashError;
5081 }
5082 if (*s == '{') {
5083 const char *start = s+1;
5084 /* look for the closing brace */
5085 while (*s != '}' && s < end)
5086 s++;
5087 if (s > start && s < end && *s == '}') {
5088 /* found a name. look it up in the unicode database */
5089 message = "unknown Unicode character name";
5090 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005091 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5092 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005093 goto store;
5094 }
5095 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005096 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005097 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005098 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 errors, &errorHandler,
5100 "unicodeescape", message,
5101 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005102 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005103 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005104 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005105 break;
5106
5107 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005108 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005109 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005110 message = "\\ at end of string";
5111 s--;
5112 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005113 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005115 errors, &errorHandler,
5116 "unicodeescape", message,
5117 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005118 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005119 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005120 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005121 }
5122 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005123 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5124 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005125 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005126 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005128 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005131 /* Ensure the length prediction worked in case of ASCII strings */
5132 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5133
5134 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5135 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005136 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005137 Py_XDECREF(errorHandler);
5138 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005140
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005142 PyErr_SetString(
5143 PyExc_UnicodeError,
5144 "\\N escapes not supported (can't load unicodedata module)"
5145 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005146 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005147 Py_XDECREF(errorHandler);
5148 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005149 return NULL;
5150
Benjamin Peterson29060642009-01-31 22:14:21 +00005151 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005153 Py_XDECREF(errorHandler);
5154 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 return NULL;
5156}
5157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005158#undef WRITE_ASCII_OR_WSTR
5159#undef WRITE_WSTR
5160
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161/* Return a Unicode-Escape string version of the Unicode object.
5162
5163 If quotes is true, the string is enclosed in u"" or u'' quotes as
5164 appropriate.
5165
5166*/
5167
Walter Dörwald79e913e2007-05-12 11:08:06 +00005168static const char *hexdigits = "0123456789abcdef";
5169
Alexander Belopolsky40018472011-02-26 01:02:56 +00005170PyObject *
5171PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5172 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005174 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005177#ifdef Py_UNICODE_WIDE
5178 const Py_ssize_t expandsize = 10;
5179#else
5180 const Py_ssize_t expandsize = 6;
5181#endif
5182
Thomas Wouters89f507f2006-12-13 04:49:30 +00005183 /* XXX(nnorwitz): rather than over-allocating, it would be
5184 better to choose a different scheme. Perhaps scan the
5185 first N-chars of the string and allocate based on that size.
5186 */
5187 /* Initial allocation is based on the longest-possible unichr
5188 escape.
5189
5190 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5191 unichr, so in this case it's the longest unichr escape. In
5192 narrow (UTF-16) builds this is five chars per source unichr
5193 since there are two unichrs in the surrogate pair, so in narrow
5194 (UTF-16) builds it's not the longest unichr escape.
5195
5196 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5197 so in the narrow (UTF-16) build case it's the longest unichr
5198 escape.
5199 */
5200
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005201 if (size == 0)
5202 return PyBytes_FromStringAndSize(NULL, 0);
5203
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005204 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005206
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005207 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 2
5209 + expandsize*size
5210 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 if (repr == NULL)
5212 return NULL;
5213
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005214 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 while (size-- > 0) {
5217 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005218
Walter Dörwald79e913e2007-05-12 11:08:06 +00005219 /* Escape backslashes */
5220 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221 *p++ = '\\';
5222 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005223 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005224 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005225
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005226#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005227 /* Map 21-bit characters to '\U00xxxxxx' */
5228 else if (ch >= 0x10000) {
5229 *p++ = '\\';
5230 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005231 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5232 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5233 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5234 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5235 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5236 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5237 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5238 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005240 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005241#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5243 else if (ch >= 0xD800 && ch < 0xDC00) {
5244 Py_UNICODE ch2;
5245 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005246
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 ch2 = *s++;
5248 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005249 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5251 *p++ = '\\';
5252 *p++ = 'U';
5253 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5254 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5255 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5256 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5257 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5258 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5259 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5260 *p++ = hexdigits[ucs & 0x0000000F];
5261 continue;
5262 }
5263 /* Fall through: isolated surrogates are copied as-is */
5264 s--;
5265 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005266 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005267#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005268
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005270 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 *p++ = '\\';
5272 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005273 *p++ = hexdigits[(ch >> 12) & 0x000F];
5274 *p++ = hexdigits[(ch >> 8) & 0x000F];
5275 *p++ = hexdigits[(ch >> 4) & 0x000F];
5276 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005278
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005279 /* Map special whitespace to '\t', \n', '\r' */
5280 else if (ch == '\t') {
5281 *p++ = '\\';
5282 *p++ = 't';
5283 }
5284 else if (ch == '\n') {
5285 *p++ = '\\';
5286 *p++ = 'n';
5287 }
5288 else if (ch == '\r') {
5289 *p++ = '\\';
5290 *p++ = 'r';
5291 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005292
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005293 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005294 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005296 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005297 *p++ = hexdigits[(ch >> 4) & 0x000F];
5298 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005299 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005300
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 /* Copy everything else as-is */
5302 else
5303 *p++ = (char) ch;
5304 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005306 assert(p - PyBytes_AS_STRING(repr) > 0);
5307 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5308 return NULL;
5309 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310}
5311
Alexander Belopolsky40018472011-02-26 01:02:56 +00005312PyObject *
5313PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005315 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 if (!PyUnicode_Check(unicode)) {
5317 PyErr_BadArgument();
5318 return NULL;
5319 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005320 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5321 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005322 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323}
5324
5325/* --- Raw Unicode Escape Codec ------------------------------------------- */
5326
Alexander Belopolsky40018472011-02-26 01:02:56 +00005327PyObject *
5328PyUnicode_DecodeRawUnicodeEscape(const char *s,
5329 Py_ssize_t size,
5330 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005332 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005333 Py_ssize_t startinpos;
5334 Py_ssize_t endinpos;
5335 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 const char *end;
5339 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340 PyObject *errorHandler = NULL;
5341 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005342
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 /* Escaped strings will always be longer than the resulting
5344 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005345 length after conversion to the true value. (But decoding error
5346 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 v = _PyUnicode_New(size);
5348 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005352 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 end = s + size;
5354 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 unsigned char c;
5356 Py_UCS4 x;
5357 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005358 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 /* Non-escape characters are interpreted as Unicode ordinals */
5361 if (*s != '\\') {
5362 *p++ = (unsigned char)*s++;
5363 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005364 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 startinpos = s-starts;
5366
5367 /* \u-escapes are only interpreted iff the number of leading
5368 backslashes if odd */
5369 bs = s;
5370 for (;s < end;) {
5371 if (*s != '\\')
5372 break;
5373 *p++ = (unsigned char)*s++;
5374 }
5375 if (((s - bs) & 1) == 0 ||
5376 s >= end ||
5377 (*s != 'u' && *s != 'U')) {
5378 continue;
5379 }
5380 p--;
5381 count = *s=='u' ? 4 : 8;
5382 s++;
5383
5384 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5385 outpos = p-PyUnicode_AS_UNICODE(v);
5386 for (x = 0, i = 0; i < count; ++i, ++s) {
5387 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005388 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005389 endinpos = s-starts;
5390 if (unicode_decode_call_errorhandler(
5391 errors, &errorHandler,
5392 "rawunicodeescape", "truncated \\uXXXX",
5393 &starts, &end, &startinpos, &endinpos, &exc, &s,
5394 &v, &outpos, &p))
5395 goto onError;
5396 goto nextByte;
5397 }
5398 x = (x<<4) & ~0xF;
5399 if (c >= '0' && c <= '9')
5400 x += c - '0';
5401 else if (c >= 'a' && c <= 'f')
5402 x += 10 + c - 'a';
5403 else
5404 x += 10 + c - 'A';
5405 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005406 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 /* UCS-2 character */
5408 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005409 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 /* UCS-4 character. Either store directly, or as
5411 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005412#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005414#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 x -= 0x10000L;
5416 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5417 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005418#endif
5419 } else {
5420 endinpos = s-starts;
5421 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005422 if (unicode_decode_call_errorhandler(
5423 errors, &errorHandler,
5424 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 &starts, &end, &startinpos, &endinpos, &exc, &s,
5426 &v, &outpos, &p))
5427 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005428 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 nextByte:
5430 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005432 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 Py_XDECREF(errorHandler);
5435 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005436 if (PyUnicode_READY(v) == -1) {
5437 Py_DECREF(v);
5438 return NULL;
5439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005441
Benjamin Peterson29060642009-01-31 22:14:21 +00005442 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444 Py_XDECREF(errorHandler);
5445 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 return NULL;
5447}
5448
Alexander Belopolsky40018472011-02-26 01:02:56 +00005449PyObject *
5450PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5451 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005453 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 char *p;
5455 char *q;
5456
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005457#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005458 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005459#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005460 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005461#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005462
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005463 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005465
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005466 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 if (repr == NULL)
5468 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005469 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005470 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005472 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 while (size-- > 0) {
5474 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005475#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 /* Map 32-bit characters to '\Uxxxxxxxx' */
5477 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005478 *p++ = '\\';
5479 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005480 *p++ = hexdigits[(ch >> 28) & 0xf];
5481 *p++ = hexdigits[(ch >> 24) & 0xf];
5482 *p++ = hexdigits[(ch >> 20) & 0xf];
5483 *p++ = hexdigits[(ch >> 16) & 0xf];
5484 *p++ = hexdigits[(ch >> 12) & 0xf];
5485 *p++ = hexdigits[(ch >> 8) & 0xf];
5486 *p++ = hexdigits[(ch >> 4) & 0xf];
5487 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005488 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005489 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005490#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5492 if (ch >= 0xD800 && ch < 0xDC00) {
5493 Py_UNICODE ch2;
5494 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005495
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 ch2 = *s++;
5497 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005498 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5500 *p++ = '\\';
5501 *p++ = 'U';
5502 *p++ = hexdigits[(ucs >> 28) & 0xf];
5503 *p++ = hexdigits[(ucs >> 24) & 0xf];
5504 *p++ = hexdigits[(ucs >> 20) & 0xf];
5505 *p++ = hexdigits[(ucs >> 16) & 0xf];
5506 *p++ = hexdigits[(ucs >> 12) & 0xf];
5507 *p++ = hexdigits[(ucs >> 8) & 0xf];
5508 *p++ = hexdigits[(ucs >> 4) & 0xf];
5509 *p++ = hexdigits[ucs & 0xf];
5510 continue;
5511 }
5512 /* Fall through: isolated surrogates are copied as-is */
5513 s--;
5514 size++;
5515 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005516#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 /* Map 16-bit characters to '\uxxxx' */
5518 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 *p++ = '\\';
5520 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005521 *p++ = hexdigits[(ch >> 12) & 0xf];
5522 *p++ = hexdigits[(ch >> 8) & 0xf];
5523 *p++ = hexdigits[(ch >> 4) & 0xf];
5524 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005526 /* Copy everything else as-is */
5527 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 *p++ = (char) ch;
5529 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005530 size = p - q;
5531
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005532 assert(size > 0);
5533 if (_PyBytes_Resize(&repr, size) < 0)
5534 return NULL;
5535 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536}
5537
Alexander Belopolsky40018472011-02-26 01:02:56 +00005538PyObject *
5539PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005541 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005543 PyErr_BadArgument();
5544 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005546 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5547 PyUnicode_GET_SIZE(unicode));
5548
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005549 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550}
5551
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005552/* --- Unicode Internal Codec ------------------------------------------- */
5553
Alexander Belopolsky40018472011-02-26 01:02:56 +00005554PyObject *
5555_PyUnicode_DecodeUnicodeInternal(const char *s,
5556 Py_ssize_t size,
5557 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005558{
5559 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005560 Py_ssize_t startinpos;
5561 Py_ssize_t endinpos;
5562 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005563 PyUnicodeObject *v;
5564 Py_UNICODE *p;
5565 const char *end;
5566 const char *reason;
5567 PyObject *errorHandler = NULL;
5568 PyObject *exc = NULL;
5569
Neal Norwitzd43069c2006-01-08 01:12:10 +00005570#ifdef Py_UNICODE_WIDE
5571 Py_UNICODE unimax = PyUnicode_GetMax();
5572#endif
5573
Thomas Wouters89f507f2006-12-13 04:49:30 +00005574 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005575 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5576 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005578 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5579 as string was created with the old API. */
5580 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005582 p = PyUnicode_AS_UNICODE(v);
5583 end = s + size;
5584
5585 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005586 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005587 /* We have to sanity check the raw data, otherwise doom looms for
5588 some malformed UCS-4 data. */
5589 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005590#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005591 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005592#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005593 end-s < Py_UNICODE_SIZE
5594 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005596 startinpos = s - starts;
5597 if (end-s < Py_UNICODE_SIZE) {
5598 endinpos = end-starts;
5599 reason = "truncated input";
5600 }
5601 else {
5602 endinpos = s - starts + Py_UNICODE_SIZE;
5603 reason = "illegal code point (> 0x10FFFF)";
5604 }
5605 outpos = p - PyUnicode_AS_UNICODE(v);
5606 if (unicode_decode_call_errorhandler(
5607 errors, &errorHandler,
5608 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005609 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005610 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005611 goto onError;
5612 }
5613 }
5614 else {
5615 p++;
5616 s += Py_UNICODE_SIZE;
5617 }
5618 }
5619
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005620 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005621 goto onError;
5622 Py_XDECREF(errorHandler);
5623 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624 if (PyUnicode_READY(v) == -1) {
5625 Py_DECREF(v);
5626 return NULL;
5627 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005628 return (PyObject *)v;
5629
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005631 Py_XDECREF(v);
5632 Py_XDECREF(errorHandler);
5633 Py_XDECREF(exc);
5634 return NULL;
5635}
5636
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637/* --- Latin-1 Codec ------------------------------------------------------ */
5638
Alexander Belopolsky40018472011-02-26 01:02:56 +00005639PyObject *
5640PyUnicode_DecodeLatin1(const char *s,
5641 Py_ssize_t size,
5642 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005645 return PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646}
5647
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005649static void
5650make_encode_exception(PyObject **exceptionObject,
5651 const char *encoding,
5652 const Py_UNICODE *unicode, Py_ssize_t size,
5653 Py_ssize_t startpos, Py_ssize_t endpos,
5654 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 *exceptionObject = PyUnicodeEncodeError_Create(
5658 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 }
5660 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5662 goto onError;
5663 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5664 goto onError;
5665 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5666 goto onError;
5667 return;
5668 onError:
5669 Py_DECREF(*exceptionObject);
5670 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 }
5672}
5673
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005674/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005675static void
5676raise_encode_exception(PyObject **exceptionObject,
5677 const char *encoding,
5678 const Py_UNICODE *unicode, Py_ssize_t size,
5679 Py_ssize_t startpos, Py_ssize_t endpos,
5680 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005681{
5682 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005684 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686}
5687
5688/* error handling callback helper:
5689 build arguments, call the callback and check the arguments,
5690 put the result into newpos and return the replacement string, which
5691 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005692static PyObject *
5693unicode_encode_call_errorhandler(const char *errors,
5694 PyObject **errorHandler,
5695 const char *encoding, const char *reason,
5696 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5697 Py_ssize_t startpos, Py_ssize_t endpos,
5698 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005700 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701
5702 PyObject *restuple;
5703 PyObject *resunicode;
5704
5705 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005707 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 }
5710
5711 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715
5716 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005721 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 Py_DECREF(restuple);
5723 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005725 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 &resunicode, newpos)) {
5727 Py_DECREF(restuple);
5728 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005729 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005730 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5731 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5732 Py_DECREF(restuple);
5733 return NULL;
5734 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005737 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5739 Py_DECREF(restuple);
5740 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 Py_INCREF(resunicode);
5743 Py_DECREF(restuple);
5744 return resunicode;
5745}
5746
Alexander Belopolsky40018472011-02-26 01:02:56 +00005747static PyObject *
5748unicode_encode_ucs1(const Py_UNICODE *p,
5749 Py_ssize_t size,
5750 const char *errors,
5751 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752{
5753 /* output object */
5754 PyObject *res;
5755 /* pointers to the beginning and end+1 of input */
5756 const Py_UNICODE *startp = p;
5757 const Py_UNICODE *endp = p + size;
5758 /* pointer to the beginning of the unencodable characters */
5759 /* const Py_UNICODE *badp = NULL; */
5760 /* pointer into the output */
5761 char *str;
5762 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005763 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005764 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5765 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005766 PyObject *errorHandler = NULL;
5767 PyObject *exc = NULL;
5768 /* the following variable is used for caching string comparisons
5769 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5770 int known_errorHandler = -1;
5771
5772 /* allocate enough for a simple encoding without
5773 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005774 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005775 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005776 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005778 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005779 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 ressize = size;
5781
5782 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 /* can we encode this? */
5786 if (c<limit) {
5787 /* no overflow check, because we know that the space is enough */
5788 *str++ = (char)c;
5789 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005790 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 else {
5792 Py_ssize_t unicodepos = p-startp;
5793 Py_ssize_t requiredsize;
5794 PyObject *repunicode;
5795 Py_ssize_t repsize;
5796 Py_ssize_t newpos;
5797 Py_ssize_t respos;
5798 Py_UNICODE *uni2;
5799 /* startpos for collecting unencodable chars */
5800 const Py_UNICODE *collstart = p;
5801 const Py_UNICODE *collend = p;
5802 /* find all unecodable characters */
5803 while ((collend < endp) && ((*collend)>=limit))
5804 ++collend;
5805 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5806 if (known_errorHandler==-1) {
5807 if ((errors==NULL) || (!strcmp(errors, "strict")))
5808 known_errorHandler = 1;
5809 else if (!strcmp(errors, "replace"))
5810 known_errorHandler = 2;
5811 else if (!strcmp(errors, "ignore"))
5812 known_errorHandler = 3;
5813 else if (!strcmp(errors, "xmlcharrefreplace"))
5814 known_errorHandler = 4;
5815 else
5816 known_errorHandler = 0;
5817 }
5818 switch (known_errorHandler) {
5819 case 1: /* strict */
5820 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5821 goto onError;
5822 case 2: /* replace */
5823 while (collstart++<collend)
5824 *str++ = '?'; /* fall through */
5825 case 3: /* ignore */
5826 p = collend;
5827 break;
5828 case 4: /* xmlcharrefreplace */
5829 respos = str - PyBytes_AS_STRING(res);
5830 /* determine replacement size (temporarily (mis)uses p) */
5831 for (p = collstart, repsize = 0; p < collend; ++p) {
5832 if (*p<10)
5833 repsize += 2+1+1;
5834 else if (*p<100)
5835 repsize += 2+2+1;
5836 else if (*p<1000)
5837 repsize += 2+3+1;
5838 else if (*p<10000)
5839 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005840#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 else
5842 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005843#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 else if (*p<100000)
5845 repsize += 2+5+1;
5846 else if (*p<1000000)
5847 repsize += 2+6+1;
5848 else
5849 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005850#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 }
5852 requiredsize = respos+repsize+(endp-collend);
5853 if (requiredsize > ressize) {
5854 if (requiredsize<2*ressize)
5855 requiredsize = 2*ressize;
5856 if (_PyBytes_Resize(&res, requiredsize))
5857 goto onError;
5858 str = PyBytes_AS_STRING(res) + respos;
5859 ressize = requiredsize;
5860 }
5861 /* generate replacement (temporarily (mis)uses p) */
5862 for (p = collstart; p < collend; ++p) {
5863 str += sprintf(str, "&#%d;", (int)*p);
5864 }
5865 p = collend;
5866 break;
5867 default:
5868 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5869 encoding, reason, startp, size, &exc,
5870 collstart-startp, collend-startp, &newpos);
5871 if (repunicode == NULL)
5872 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005873 if (PyBytes_Check(repunicode)) {
5874 /* Directly copy bytes result to output. */
5875 repsize = PyBytes_Size(repunicode);
5876 if (repsize > 1) {
5877 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005878 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005879 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5880 Py_DECREF(repunicode);
5881 goto onError;
5882 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005883 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005884 ressize += repsize-1;
5885 }
5886 memcpy(str, PyBytes_AsString(repunicode), repsize);
5887 str += repsize;
5888 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005889 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005890 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005891 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 /* need more space? (at least enough for what we
5893 have+the replacement+the rest of the string, so
5894 we won't have to check space for encodable characters) */
5895 respos = str - PyBytes_AS_STRING(res);
5896 repsize = PyUnicode_GET_SIZE(repunicode);
5897 requiredsize = respos+repsize+(endp-collend);
5898 if (requiredsize > ressize) {
5899 if (requiredsize<2*ressize)
5900 requiredsize = 2*ressize;
5901 if (_PyBytes_Resize(&res, requiredsize)) {
5902 Py_DECREF(repunicode);
5903 goto onError;
5904 }
5905 str = PyBytes_AS_STRING(res) + respos;
5906 ressize = requiredsize;
5907 }
5908 /* check if there is anything unencodable in the replacement
5909 and copy it to the output */
5910 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5911 c = *uni2;
5912 if (c >= limit) {
5913 raise_encode_exception(&exc, encoding, startp, size,
5914 unicodepos, unicodepos+1, reason);
5915 Py_DECREF(repunicode);
5916 goto onError;
5917 }
5918 *str = (char)c;
5919 }
5920 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005921 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005922 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005923 }
5924 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005925 /* Resize if we allocated to much */
5926 size = str - PyBytes_AS_STRING(res);
5927 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005928 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005929 if (_PyBytes_Resize(&res, size) < 0)
5930 goto onError;
5931 }
5932
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005933 Py_XDECREF(errorHandler);
5934 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005935 return res;
5936
5937 onError:
5938 Py_XDECREF(res);
5939 Py_XDECREF(errorHandler);
5940 Py_XDECREF(exc);
5941 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005942}
5943
Alexander Belopolsky40018472011-02-26 01:02:56 +00005944PyObject *
5945PyUnicode_EncodeLatin1(const Py_UNICODE *p,
5946 Py_ssize_t size,
5947 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005949 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950}
5951
Alexander Belopolsky40018472011-02-26 01:02:56 +00005952PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005953_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954{
5955 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 PyErr_BadArgument();
5957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005959 if (PyUnicode_READY(unicode) == -1)
5960 return NULL;
5961 /* Fast path: if it is a one-byte string, construct
5962 bytes object directly. */
5963 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
5964 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
5965 PyUnicode_GET_LENGTH(unicode));
5966 /* Non-Latin-1 characters present. Defer to above function to
5967 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005970 errors);
5971}
5972
5973PyObject*
5974PyUnicode_AsLatin1String(PyObject *unicode)
5975{
5976 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977}
5978
5979/* --- 7-bit ASCII Codec -------------------------------------------------- */
5980
Alexander Belopolsky40018472011-02-26 01:02:56 +00005981PyObject *
5982PyUnicode_DecodeASCII(const char *s,
5983 Py_ssize_t size,
5984 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005986 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 PyUnicodeObject *v;
5988 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005989 Py_ssize_t startinpos;
5990 Py_ssize_t endinpos;
5991 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005992 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005993 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 PyObject *errorHandler = NULL;
5995 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005996 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00005997
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005999 if (size == 1 && *(unsigned char*)s < 128)
6000 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6001
6002 /* Fast path. Assume the input actually *is* ASCII, and allocate
6003 a single-block Unicode object with that assumption. If there is
6004 an error, drop the object and start over. */
6005 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6006 if (v == NULL)
6007 goto onError;
6008 d = PyUnicode_1BYTE_DATA(v);
6009 for (i = 0; i < size; i++) {
6010 unsigned char ch = ((unsigned char*)s)[i];
6011 if (ch < 128)
6012 d[i] = ch;
6013 else
6014 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006016 if (i == size)
6017 return (PyObject*)v;
6018 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006019
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 v = _PyUnicode_New(size);
6021 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 e = s + size;
6027 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 register unsigned char c = (unsigned char)*s;
6029 if (c < 128) {
6030 *p++ = c;
6031 ++s;
6032 }
6033 else {
6034 startinpos = s-starts;
6035 endinpos = startinpos + 1;
6036 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6037 if (unicode_decode_call_errorhandler(
6038 errors, &errorHandler,
6039 "ascii", "ordinal not in range(128)",
6040 &starts, &e, &startinpos, &endinpos, &exc, &s,
6041 &v, &outpos, &p))
6042 goto onError;
6043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006045 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6047 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006048 Py_XDECREF(errorHandler);
6049 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006050 if (PyUnicode_READY(v) == -1) {
6051 Py_DECREF(v);
6052 return NULL;
6053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006055
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006058 Py_XDECREF(errorHandler);
6059 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 return NULL;
6061}
6062
Alexander Belopolsky40018472011-02-26 01:02:56 +00006063PyObject *
6064PyUnicode_EncodeASCII(const Py_UNICODE *p,
6065 Py_ssize_t size,
6066 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006068 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069}
6070
Alexander Belopolsky40018472011-02-26 01:02:56 +00006071PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006072_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073{
6074 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 PyErr_BadArgument();
6076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006078 if (PyUnicode_READY(unicode) == -1)
6079 return NULL;
6080 /* Fast path: if it is an ASCII-only string, construct bytes object
6081 directly. Else defer to above function to raise the exception. */
6082 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6083 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6084 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006087 errors);
6088}
6089
6090PyObject *
6091PyUnicode_AsASCIIString(PyObject *unicode)
6092{
6093 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094}
6095
Victor Stinner99b95382011-07-04 14:23:54 +02006096#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006097
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006098/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006099
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006100#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006101#define NEED_RETRY
6102#endif
6103
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
      encodings; see the IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006108
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the previous character does
   not start with a lead byte, or when the previous character is exactly
   two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6120
6121/*
6122 * Decode MBCS string into unicode object. If 'final' is set, converts
6123 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6124 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006125static int
6126decode_mbcs(PyUnicodeObject **v,
6127 const char *s, /* MBCS string */
6128 int size, /* sizeof MBCS string */
6129 int final,
6130 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006131{
6132 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006133 Py_ssize_t n;
6134 DWORD usize;
6135 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006136
6137 assert(size >= 0);
6138
Victor Stinner554f3f02010-06-16 23:33:54 +00006139 /* check and handle 'errors' arg */
6140 if (errors==NULL || strcmp(errors, "strict")==0)
6141 flags = MB_ERR_INVALID_CHARS;
6142 else if (strcmp(errors, "ignore")==0)
6143 flags = 0;
6144 else {
6145 PyErr_Format(PyExc_ValueError,
6146 "mbcs encoding does not support errors='%s'",
6147 errors);
6148 return -1;
6149 }
6150
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006151 /* Skip trailing lead-byte unless 'final' is set */
6152 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006154
6155 /* First get the size of the result */
6156 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006157 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6158 if (usize==0)
6159 goto mbcs_decode_error;
6160 } else
6161 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006162
6163 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 /* Create unicode object */
6165 *v = _PyUnicode_New(usize);
6166 if (*v == NULL)
6167 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006168 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006169 }
6170 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 /* Extend unicode object */
6172 n = PyUnicode_GET_SIZE(*v);
6173 if (_PyUnicode_Resize(v, n + usize) < 0)
6174 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006175 }
6176
6177 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006178 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006180 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6181 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006183 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006184 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006185
6186mbcs_decode_error:
6187 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6188 we raise a UnicodeDecodeError - else it is a 'generic'
6189 windows error
6190 */
6191 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6192 /* Ideally, we should get reason from FormatMessage - this
6193 is the Windows 2000 English version of the message
6194 */
6195 PyObject *exc = NULL;
6196 const char *reason = "No mapping for the Unicode character exists "
6197 "in the target multi-byte code page.";
6198 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6199 if (exc != NULL) {
6200 PyCodec_StrictErrors(exc);
6201 Py_DECREF(exc);
6202 }
6203 } else {
6204 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6205 }
6206 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006207}
6208
Alexander Belopolsky40018472011-02-26 01:02:56 +00006209PyObject *
6210PyUnicode_DecodeMBCSStateful(const char *s,
6211 Py_ssize_t size,
6212 const char *errors,
6213 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006214{
6215 PyUnicodeObject *v = NULL;
6216 int done;
6217
6218 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006220
6221#ifdef NEED_RETRY
6222 retry:
6223 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006224 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006225 else
6226#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006227 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006228
6229 if (done < 0) {
6230 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006232 }
6233
6234 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006236
6237#ifdef NEED_RETRY
6238 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 s += done;
6240 size -= done;
6241 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006242 }
6243#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006244 if (PyUnicode_READY(v) == -1) {
6245 Py_DECREF(v);
6246 return NULL;
6247 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006248 return (PyObject *)v;
6249}
6250
Alexander Belopolsky40018472011-02-26 01:02:56 +00006251PyObject *
6252PyUnicode_DecodeMBCS(const char *s,
6253 Py_ssize_t size,
6254 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006255{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006256 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6257}
6258
6259/*
6260 * Convert unicode into string object (MBCS).
6261 * Returns 0 if succeed, -1 otherwise.
6262 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006263static int
6264encode_mbcs(PyObject **repr,
6265 const Py_UNICODE *p, /* unicode */
6266 int size, /* size of unicode */
6267 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006268{
Victor Stinner554f3f02010-06-16 23:33:54 +00006269 BOOL usedDefaultChar = FALSE;
6270 BOOL *pusedDefaultChar;
6271 int mbcssize;
6272 Py_ssize_t n;
6273 PyObject *exc = NULL;
6274 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006275
6276 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006277
Victor Stinner554f3f02010-06-16 23:33:54 +00006278 /* check and handle 'errors' arg */
6279 if (errors==NULL || strcmp(errors, "strict")==0) {
6280 flags = WC_NO_BEST_FIT_CHARS;
6281 pusedDefaultChar = &usedDefaultChar;
6282 } else if (strcmp(errors, "replace")==0) {
6283 flags = 0;
6284 pusedDefaultChar = NULL;
6285 } else {
6286 PyErr_Format(PyExc_ValueError,
6287 "mbcs encoding does not support errors='%s'",
6288 errors);
6289 return -1;
6290 }
6291
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006292 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006293 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006294 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6295 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 if (mbcssize == 0) {
6297 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6298 return -1;
6299 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006300 /* If we used a default char, then we failed! */
6301 if (pusedDefaultChar && *pusedDefaultChar)
6302 goto mbcs_encode_error;
6303 } else {
6304 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006305 }
6306
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006307 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 /* Create string object */
6309 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6310 if (*repr == NULL)
6311 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006312 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006313 }
6314 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 /* Extend string object */
6316 n = PyBytes_Size(*repr);
6317 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6318 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006319 }
6320
6321 /* Do the conversion */
6322 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006324 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6325 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6327 return -1;
6328 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006329 if (pusedDefaultChar && *pusedDefaultChar)
6330 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006331 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006332 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006333
6334mbcs_encode_error:
6335 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6336 Py_XDECREF(exc);
6337 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006338}
6339
Alexander Belopolsky40018472011-02-26 01:02:56 +00006340PyObject *
6341PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6342 Py_ssize_t size,
6343 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006344{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006345 PyObject *repr = NULL;
6346 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006347
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006348#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006350 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006351 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006352 else
6353#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006354 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006355
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006356 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 Py_XDECREF(repr);
6358 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006359 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006360
6361#ifdef NEED_RETRY
6362 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 p += INT_MAX;
6364 size -= INT_MAX;
6365 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006366 }
6367#endif
6368
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006369 return repr;
6370}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006371
Alexander Belopolsky40018472011-02-26 01:02:56 +00006372PyObject *
6373PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006374{
6375 if (!PyUnicode_Check(unicode)) {
6376 PyErr_BadArgument();
6377 return NULL;
6378 }
6379 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 PyUnicode_GET_SIZE(unicode),
6381 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006382}
6383
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006384#undef NEED_RETRY
6385
Victor Stinner99b95382011-07-04 14:23:54 +02006386#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006387
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388/* --- Character Mapping Codec -------------------------------------------- */
6389
Alexander Belopolsky40018472011-02-26 01:02:56 +00006390PyObject *
6391PyUnicode_DecodeCharmap(const char *s,
6392 Py_ssize_t size,
6393 PyObject *mapping,
6394 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006397 Py_ssize_t startinpos;
6398 Py_ssize_t endinpos;
6399 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 PyUnicodeObject *v;
6402 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006403 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 PyObject *errorHandler = NULL;
6405 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006406 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006407 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006408
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 /* Default to Latin-1 */
6410 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412
6413 v = _PyUnicode_New(size);
6414 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006420 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 mapstring = PyUnicode_AS_UNICODE(mapping);
6422 maplen = PyUnicode_GET_SIZE(mapping);
6423 while (s < e) {
6424 unsigned char ch = *s;
6425 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 if (ch < maplen)
6428 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 if (x == 0xfffe) {
6431 /* undefined mapping */
6432 outpos = p-PyUnicode_AS_UNICODE(v);
6433 startinpos = s-starts;
6434 endinpos = startinpos+1;
6435 if (unicode_decode_call_errorhandler(
6436 errors, &errorHandler,
6437 "charmap", "character maps to <undefined>",
6438 &starts, &e, &startinpos, &endinpos, &exc, &s,
6439 &v, &outpos, &p)) {
6440 goto onError;
6441 }
6442 continue;
6443 }
6444 *p++ = x;
6445 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006446 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006447 }
6448 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 while (s < e) {
6450 unsigned char ch = *s;
6451 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006452
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6454 w = PyLong_FromLong((long)ch);
6455 if (w == NULL)
6456 goto onError;
6457 x = PyObject_GetItem(mapping, w);
6458 Py_DECREF(w);
6459 if (x == NULL) {
6460 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6461 /* No mapping found means: mapping is undefined. */
6462 PyErr_Clear();
6463 x = Py_None;
6464 Py_INCREF(x);
6465 } else
6466 goto onError;
6467 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006468
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 /* Apply mapping */
6470 if (PyLong_Check(x)) {
6471 long value = PyLong_AS_LONG(x);
6472 if (value < 0 || value > 65535) {
6473 PyErr_SetString(PyExc_TypeError,
6474 "character mapping must be in range(65536)");
6475 Py_DECREF(x);
6476 goto onError;
6477 }
6478 *p++ = (Py_UNICODE)value;
6479 }
6480 else if (x == Py_None) {
6481 /* undefined mapping */
6482 outpos = p-PyUnicode_AS_UNICODE(v);
6483 startinpos = s-starts;
6484 endinpos = startinpos+1;
6485 if (unicode_decode_call_errorhandler(
6486 errors, &errorHandler,
6487 "charmap", "character maps to <undefined>",
6488 &starts, &e, &startinpos, &endinpos, &exc, &s,
6489 &v, &outpos, &p)) {
6490 Py_DECREF(x);
6491 goto onError;
6492 }
6493 Py_DECREF(x);
6494 continue;
6495 }
6496 else if (PyUnicode_Check(x)) {
6497 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006498
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 if (targetsize == 1)
6500 /* 1-1 mapping */
6501 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006502
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 else if (targetsize > 1) {
6504 /* 1-n mapping */
6505 if (targetsize > extrachars) {
6506 /* resize first */
6507 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6508 Py_ssize_t needed = (targetsize - extrachars) + \
6509 (targetsize << 2);
6510 extrachars += needed;
6511 /* XXX overflow detection missing */
6512 if (_PyUnicode_Resize(&v,
6513 PyUnicode_GET_SIZE(v) + needed) < 0) {
6514 Py_DECREF(x);
6515 goto onError;
6516 }
6517 p = PyUnicode_AS_UNICODE(v) + oldpos;
6518 }
6519 Py_UNICODE_COPY(p,
6520 PyUnicode_AS_UNICODE(x),
6521 targetsize);
6522 p += targetsize;
6523 extrachars -= targetsize;
6524 }
6525 /* 1-0 mapping: skip the character */
6526 }
6527 else {
6528 /* wrong return value */
6529 PyErr_SetString(PyExc_TypeError,
6530 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006531 Py_DECREF(x);
6532 goto onError;
6533 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 Py_DECREF(x);
6535 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 }
6538 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6540 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006541 Py_XDECREF(errorHandler);
6542 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006543 if (PyUnicode_READY(v) == -1) {
6544 Py_DECREF(v);
6545 return NULL;
6546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006548
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 Py_XDECREF(errorHandler);
6551 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 Py_XDECREF(v);
6553 return NULL;
6554}
6555
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006556/* Charmap encoding: the lookup table */
6557
Alexander Belopolsky40018472011-02-26 01:02:56 +00006558struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 PyObject_HEAD
6560 unsigned char level1[32];
6561 int count2, count3;
6562 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006563};
6564
6565static PyObject*
6566encoding_map_size(PyObject *obj, PyObject* args)
6567{
6568 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006569 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006571}
6572
6573static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006574 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 PyDoc_STR("Return the size (in bytes) of this object") },
6576 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006577};
6578
6579static void
6580encoding_map_dealloc(PyObject* o)
6581{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006582 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006583}
6584
6585static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006586 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 "EncodingMap", /*tp_name*/
6588 sizeof(struct encoding_map), /*tp_basicsize*/
6589 0, /*tp_itemsize*/
6590 /* methods */
6591 encoding_map_dealloc, /*tp_dealloc*/
6592 0, /*tp_print*/
6593 0, /*tp_getattr*/
6594 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006595 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 0, /*tp_repr*/
6597 0, /*tp_as_number*/
6598 0, /*tp_as_sequence*/
6599 0, /*tp_as_mapping*/
6600 0, /*tp_hash*/
6601 0, /*tp_call*/
6602 0, /*tp_str*/
6603 0, /*tp_getattro*/
6604 0, /*tp_setattro*/
6605 0, /*tp_as_buffer*/
6606 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6607 0, /*tp_doc*/
6608 0, /*tp_traverse*/
6609 0, /*tp_clear*/
6610 0, /*tp_richcompare*/
6611 0, /*tp_weaklistoffset*/
6612 0, /*tp_iter*/
6613 0, /*tp_iternext*/
6614 encoding_map_methods, /*tp_methods*/
6615 0, /*tp_members*/
6616 0, /*tp_getset*/
6617 0, /*tp_base*/
6618 0, /*tp_dict*/
6619 0, /*tp_descr_get*/
6620 0, /*tp_descr_set*/
6621 0, /*tp_dictoffset*/
6622 0, /*tp_init*/
6623 0, /*tp_alloc*/
6624 0, /*tp_new*/
6625 0, /*tp_free*/
6626 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006627};
6628
6629PyObject*
6630PyUnicode_BuildEncodingMap(PyObject* string)
6631{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006632 PyObject *result;
6633 struct encoding_map *mresult;
6634 int i;
6635 int need_dict = 0;
6636 unsigned char level1[32];
6637 unsigned char level2[512];
6638 unsigned char *mlevel1, *mlevel2, *mlevel3;
6639 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006640 int kind;
6641 void *data;
6642 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006644 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006645 PyErr_BadArgument();
6646 return NULL;
6647 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006648 kind = PyUnicode_KIND(string);
6649 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006650 memset(level1, 0xFF, sizeof level1);
6651 memset(level2, 0xFF, sizeof level2);
6652
6653 /* If there isn't a one-to-one mapping of NULL to \0,
6654 or if there are non-BMP characters, we need to use
6655 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006656 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006657 need_dict = 1;
6658 for (i = 1; i < 256; i++) {
6659 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006660 ch = PyUnicode_READ(kind, data, i);
6661 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006662 need_dict = 1;
6663 break;
6664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006665 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006666 /* unmapped character */
6667 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006668 l1 = ch >> 11;
6669 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006670 if (level1[l1] == 0xFF)
6671 level1[l1] = count2++;
6672 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006673 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006674 }
6675
6676 if (count2 >= 0xFF || count3 >= 0xFF)
6677 need_dict = 1;
6678
6679 if (need_dict) {
6680 PyObject *result = PyDict_New();
6681 PyObject *key, *value;
6682 if (!result)
6683 return NULL;
6684 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006685 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006686 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006687 if (!key || !value)
6688 goto failed1;
6689 if (PyDict_SetItem(result, key, value) == -1)
6690 goto failed1;
6691 Py_DECREF(key);
6692 Py_DECREF(value);
6693 }
6694 return result;
6695 failed1:
6696 Py_XDECREF(key);
6697 Py_XDECREF(value);
6698 Py_DECREF(result);
6699 return NULL;
6700 }
6701
6702 /* Create a three-level trie */
6703 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6704 16*count2 + 128*count3 - 1);
6705 if (!result)
6706 return PyErr_NoMemory();
6707 PyObject_Init(result, &EncodingMapType);
6708 mresult = (struct encoding_map*)result;
6709 mresult->count2 = count2;
6710 mresult->count3 = count3;
6711 mlevel1 = mresult->level1;
6712 mlevel2 = mresult->level23;
6713 mlevel3 = mresult->level23 + 16*count2;
6714 memcpy(mlevel1, level1, 32);
6715 memset(mlevel2, 0xFF, 16*count2);
6716 memset(mlevel3, 0, 128*count3);
6717 count3 = 0;
6718 for (i = 1; i < 256; i++) {
6719 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006720 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006721 /* unmapped character */
6722 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006723 o1 = PyUnicode_READ(kind, data, i)>>11;
6724 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006725 i2 = 16*mlevel1[o1] + o2;
6726 if (mlevel2[i2] == 0xFF)
6727 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006728 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006729 i3 = 128*mlevel2[i2] + o3;
6730 mlevel3[i3] = i;
6731 }
6732 return result;
6733}
6734
6735static int
6736encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6737{
6738 struct encoding_map *map = (struct encoding_map*)mapping;
6739 int l1 = c>>11;
6740 int l2 = (c>>7) & 0xF;
6741 int l3 = c & 0x7F;
6742 int i;
6743
6744#ifdef Py_UNICODE_WIDE
6745 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006747 }
6748#endif
6749 if (c == 0)
6750 return 0;
6751 /* level 1*/
6752 i = map->level1[l1];
6753 if (i == 0xFF) {
6754 return -1;
6755 }
6756 /* level 2*/
6757 i = map->level23[16*i+l2];
6758 if (i == 0xFF) {
6759 return -1;
6760 }
6761 /* level 3 */
6762 i = map->level23[16*map->count2 + 128*i + l3];
6763 if (i == 0) {
6764 return -1;
6765 }
6766 return i;
6767}
6768
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006769/* Lookup the character ch in the mapping. If the character
6770 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006771 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006772static PyObject *
6773charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774{
Christian Heimes217cfd12007-12-02 14:31:20 +00006775 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006776 PyObject *x;
6777
6778 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006780 x = PyObject_GetItem(mapping, w);
6781 Py_DECREF(w);
6782 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6784 /* No mapping found means: mapping is undefined. */
6785 PyErr_Clear();
6786 x = Py_None;
6787 Py_INCREF(x);
6788 return x;
6789 } else
6790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006792 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006793 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006794 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 long value = PyLong_AS_LONG(x);
6796 if (value < 0 || value > 255) {
6797 PyErr_SetString(PyExc_TypeError,
6798 "character mapping must be in range(256)");
6799 Py_DECREF(x);
6800 return NULL;
6801 }
6802 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006804 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 /* wrong return value */
6808 PyErr_Format(PyExc_TypeError,
6809 "character mapping must return integer, bytes or None, not %.400s",
6810 x->ob_type->tp_name);
6811 Py_DECREF(x);
6812 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 }
6814}
6815
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006816static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006817charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006818{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006819 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6820 /* exponentially overallocate to minimize reallocations */
6821 if (requiredsize < 2*outsize)
6822 requiredsize = 2*outsize;
6823 if (_PyBytes_Resize(outobj, requiredsize))
6824 return -1;
6825 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006826}
6827
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* a Python exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006831/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006832 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006833 space is available. Return a new reference to the object that
6834 was put in the output buffer, or Py_None, if the mapping was undefined
6835 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006836 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006837static charmapencode_result
6838charmapencode_output(Py_UNICODE c, PyObject *mapping,
6839 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006840{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006841 PyObject *rep;
6842 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006843 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844
Christian Heimes90aa7642007-12-19 02:45:37 +00006845 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006846 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006848 if (res == -1)
6849 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006850 if (outsize<requiredsize)
6851 if (charmapencode_resize(outobj, outpos, requiredsize))
6852 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006853 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 outstart[(*outpos)++] = (char)res;
6855 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006856 }
6857
6858 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006859 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006861 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 Py_DECREF(rep);
6863 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006864 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 if (PyLong_Check(rep)) {
6866 Py_ssize_t requiredsize = *outpos+1;
6867 if (outsize<requiredsize)
6868 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6869 Py_DECREF(rep);
6870 return enc_EXCEPTION;
6871 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006872 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006874 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 else {
6876 const char *repchars = PyBytes_AS_STRING(rep);
6877 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6878 Py_ssize_t requiredsize = *outpos+repsize;
6879 if (outsize<requiredsize)
6880 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6881 Py_DECREF(rep);
6882 return enc_EXCEPTION;
6883 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006884 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 memcpy(outstart + *outpos, repchars, repsize);
6886 *outpos += repsize;
6887 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006888 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006889 Py_DECREF(rep);
6890 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006891}
6892
6893/* handle an error in PyUnicode_EncodeCharmap
6894 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006895static int
6896charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006897 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006899 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006900 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006901{
6902 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006903 Py_ssize_t repsize;
6904 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006905 Py_UNICODE *uni2;
6906 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006907 Py_ssize_t collstartpos = *inpos;
6908 Py_ssize_t collendpos = *inpos+1;
6909 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006910 char *encoding = "charmap";
6911 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006912 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006914 /* find all unencodable characters */
6915 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006916 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006917 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 int res = encoding_map_lookup(p[collendpos], mapping);
6919 if (res != -1)
6920 break;
6921 ++collendpos;
6922 continue;
6923 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006924
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 rep = charmapencode_lookup(p[collendpos], mapping);
6926 if (rep==NULL)
6927 return -1;
6928 else if (rep!=Py_None) {
6929 Py_DECREF(rep);
6930 break;
6931 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006932 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006934 }
6935 /* cache callback name lookup
6936 * (if not done yet, i.e. it's the first error) */
6937 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 if ((errors==NULL) || (!strcmp(errors, "strict")))
6939 *known_errorHandler = 1;
6940 else if (!strcmp(errors, "replace"))
6941 *known_errorHandler = 2;
6942 else if (!strcmp(errors, "ignore"))
6943 *known_errorHandler = 3;
6944 else if (!strcmp(errors, "xmlcharrefreplace"))
6945 *known_errorHandler = 4;
6946 else
6947 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006948 }
6949 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006950 case 1: /* strict */
6951 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6952 return -1;
6953 case 2: /* replace */
6954 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006955 x = charmapencode_output('?', mapping, res, respos);
6956 if (x==enc_EXCEPTION) {
6957 return -1;
6958 }
6959 else if (x==enc_FAILED) {
6960 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6961 return -1;
6962 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006963 }
6964 /* fall through */
6965 case 3: /* ignore */
6966 *inpos = collendpos;
6967 break;
6968 case 4: /* xmlcharrefreplace */
6969 /* generate replacement (temporarily (mis)uses p) */
6970 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 char buffer[2+29+1+1];
6972 char *cp;
6973 sprintf(buffer, "&#%d;", (int)p[collpos]);
6974 for (cp = buffer; *cp; ++cp) {
6975 x = charmapencode_output(*cp, mapping, res, respos);
6976 if (x==enc_EXCEPTION)
6977 return -1;
6978 else if (x==enc_FAILED) {
6979 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6980 return -1;
6981 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006982 }
6983 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006984 *inpos = collendpos;
6985 break;
6986 default:
6987 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 encoding, reason, p, size, exceptionObject,
6989 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006990 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006992 if (PyBytes_Check(repunicode)) {
6993 /* Directly copy bytes result to output. */
6994 Py_ssize_t outsize = PyBytes_Size(*res);
6995 Py_ssize_t requiredsize;
6996 repsize = PyBytes_Size(repunicode);
6997 requiredsize = *respos + repsize;
6998 if (requiredsize > outsize)
6999 /* Make room for all additional bytes. */
7000 if (charmapencode_resize(res, respos, requiredsize)) {
7001 Py_DECREF(repunicode);
7002 return -1;
7003 }
7004 memcpy(PyBytes_AsString(*res) + *respos,
7005 PyBytes_AsString(repunicode), repsize);
7006 *respos += repsize;
7007 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007008 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007009 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007010 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007011 /* generate replacement */
7012 repsize = PyUnicode_GET_SIZE(repunicode);
7013 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 x = charmapencode_output(*uni2, mapping, res, respos);
7015 if (x==enc_EXCEPTION) {
7016 return -1;
7017 }
7018 else if (x==enc_FAILED) {
7019 Py_DECREF(repunicode);
7020 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7021 return -1;
7022 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007023 }
7024 *inpos = newpos;
7025 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007026 }
7027 return 0;
7028}
7029
Alexander Belopolsky40018472011-02-26 01:02:56 +00007030PyObject *
7031PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7032 Py_ssize_t size,
7033 PyObject *mapping,
7034 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007036 /* output object */
7037 PyObject *res = NULL;
7038 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007039 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007040 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007041 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007042 PyObject *errorHandler = NULL;
7043 PyObject *exc = NULL;
7044 /* the following variable is used for caching string comparisons
7045 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7046 * 3=ignore, 4=xmlcharrefreplace */
7047 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048
7049 /* Default to Latin-1 */
7050 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007053 /* allocate enough for a simple encoding without
7054 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007055 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007056 if (res == NULL)
7057 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007058 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007061 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 /* try to encode it */
7063 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7064 if (x==enc_EXCEPTION) /* error */
7065 goto onError;
7066 if (x==enc_FAILED) { /* unencodable character */
7067 if (charmap_encoding_error(p, size, &inpos, mapping,
7068 &exc,
7069 &known_errorHandler, &errorHandler, errors,
7070 &res, &respos)) {
7071 goto onError;
7072 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007073 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007074 else
7075 /* done with this character => adjust input position */
7076 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007079 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007080 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007081 if (_PyBytes_Resize(&res, respos) < 0)
7082 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007084 Py_XDECREF(exc);
7085 Py_XDECREF(errorHandler);
7086 return res;
7087
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007089 Py_XDECREF(res);
7090 Py_XDECREF(exc);
7091 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092 return NULL;
7093}
7094
Alexander Belopolsky40018472011-02-26 01:02:56 +00007095PyObject *
7096PyUnicode_AsCharmapString(PyObject *unicode,
7097 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098{
7099 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 PyErr_BadArgument();
7101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 }
7103 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 PyUnicode_GET_SIZE(unicode),
7105 mapping,
7106 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107}
7108
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007109/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007110static void
7111make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007112 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007113 Py_ssize_t startpos, Py_ssize_t endpos,
7114 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007116 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007117 *exceptionObject = _PyUnicodeTranslateError_Create(
7118 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119 }
7120 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7122 goto onError;
7123 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7124 goto onError;
7125 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7126 goto onError;
7127 return;
7128 onError:
7129 Py_DECREF(*exceptionObject);
7130 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 }
7132}
7133
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007134/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007135static void
7136raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007137 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007138 Py_ssize_t startpos, Py_ssize_t endpos,
7139 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007140{
7141 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007142 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007143 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007144 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007145}
7146
7147/* error handling callback helper:
7148 build arguments, call the callback and check the arguments,
7149 put the result into newpos and return the replacement string, which
7150 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007151static PyObject *
7152unicode_translate_call_errorhandler(const char *errors,
7153 PyObject **errorHandler,
7154 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007155 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007156 Py_ssize_t startpos, Py_ssize_t endpos,
7157 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007158{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007159 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007160
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007161 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007162 PyObject *restuple;
7163 PyObject *resunicode;
7164
7165 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007167 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007169 }
7170
7171 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007172 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007173 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007175
7176 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007178 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007179 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007180 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007181 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007182 Py_DECREF(restuple);
7183 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007184 }
7185 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 &resunicode, &i_newpos)) {
7187 Py_DECREF(restuple);
7188 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007189 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007190 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007191 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007192 else
7193 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007194 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7196 Py_DECREF(restuple);
7197 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007198 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199 Py_INCREF(resunicode);
7200 Py_DECREF(restuple);
7201 return resunicode;
7202}
7203
7204/* Lookup the character ch in the mapping and put the result in result,
7205 which must be decrefed by the caller.
7206 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007207static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007208charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007209{
Christian Heimes217cfd12007-12-02 14:31:20 +00007210 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007211 PyObject *x;
7212
7213 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007215 x = PyObject_GetItem(mapping, w);
7216 Py_DECREF(w);
7217 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7219 /* No mapping found means: use 1:1 mapping. */
7220 PyErr_Clear();
7221 *result = NULL;
7222 return 0;
7223 } else
7224 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007225 }
7226 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 *result = x;
7228 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007229 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007230 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 long value = PyLong_AS_LONG(x);
7232 long max = PyUnicode_GetMax();
7233 if (value < 0 || value > max) {
7234 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007235 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 Py_DECREF(x);
7237 return -1;
7238 }
7239 *result = x;
7240 return 0;
7241 }
7242 else if (PyUnicode_Check(x)) {
7243 *result = x;
7244 return 0;
7245 }
7246 else {
7247 /* wrong return value */
7248 PyErr_SetString(PyExc_TypeError,
7249 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007250 Py_DECREF(x);
7251 return -1;
7252 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007253}
7254/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 if not reallocate and adjust various state variables.
7256 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007257static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007258charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007260{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007261 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007262 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007263 /* exponentially overallocate to minimize reallocations */
7264 if (requiredsize < 2 * oldsize)
7265 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007266 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7267 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007269 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007270 }
7271 return 0;
7272}
7273/* lookup the character, put the result in the output string and adjust
7274 various state variables. Return a new reference to the object that
7275 was put in the output buffer in *result, or Py_None, if the mapping was
7276 undefined (in which case no character was written).
7277 The called must decref result.
7278 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007279static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007280charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7281 PyObject *mapping, Py_UCS4 **output,
7282 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007283 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007285 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7286 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007288 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007289 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007290 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007291 }
7292 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007294 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007296 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007297 }
7298 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007299 Py_ssize_t repsize;
7300 if (PyUnicode_READY(*res) == -1)
7301 return -1;
7302 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007303 if (repsize==1) {
7304 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007305 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 }
7307 else if (repsize!=0) {
7308 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007309 Py_ssize_t requiredsize = *opos +
7310 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007312 Py_ssize_t i;
7313 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007315 for(i = 0; i < repsize; i++)
7316 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007318 }
7319 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007321 return 0;
7322}
7323
Alexander Belopolsky40018472011-02-26 01:02:56 +00007324PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007325_PyUnicode_TranslateCharmap(PyObject *input,
7326 PyObject *mapping,
7327 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007329 /* input object */
7330 char *idata;
7331 Py_ssize_t size, i;
7332 int kind;
7333 /* output buffer */
7334 Py_UCS4 *output = NULL;
7335 Py_ssize_t osize;
7336 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007337 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007338 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007339 char *reason = "character maps to <undefined>";
7340 PyObject *errorHandler = NULL;
7341 PyObject *exc = NULL;
7342 /* the following variable is used for caching string comparisons
7343 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7344 * 3=ignore, 4=xmlcharrefreplace */
7345 int known_errorHandler = -1;
7346
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 PyErr_BadArgument();
7349 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007352 if (PyUnicode_READY(input) == -1)
7353 return NULL;
7354 idata = (char*)PyUnicode_DATA(input);
7355 kind = PyUnicode_KIND(input);
7356 size = PyUnicode_GET_LENGTH(input);
7357 i = 0;
7358
7359 if (size == 0) {
7360 Py_INCREF(input);
7361 return input;
7362 }
7363
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007364 /* allocate enough for a simple 1:1 translation without
7365 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007366 osize = size;
7367 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7368 opos = 0;
7369 if (output == NULL) {
7370 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007374 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 /* try to encode it */
7376 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007377 if (charmaptranslate_output(input, i, mapping,
7378 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 Py_XDECREF(x);
7380 goto onError;
7381 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007382 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007384 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 else { /* untranslatable character */
7386 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7387 Py_ssize_t repsize;
7388 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007389 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007391 Py_ssize_t collstart = i;
7392 Py_ssize_t collend = i+1;
7393 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394
Benjamin Peterson29060642009-01-31 22:14:21 +00007395 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007396 while (collend < size) {
7397 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 goto onError;
7399 Py_XDECREF(x);
7400 if (x!=Py_None)
7401 break;
7402 ++collend;
7403 }
7404 /* cache callback name lookup
7405 * (if not done yet, i.e. it's the first error) */
7406 if (known_errorHandler==-1) {
7407 if ((errors==NULL) || (!strcmp(errors, "strict")))
7408 known_errorHandler = 1;
7409 else if (!strcmp(errors, "replace"))
7410 known_errorHandler = 2;
7411 else if (!strcmp(errors, "ignore"))
7412 known_errorHandler = 3;
7413 else if (!strcmp(errors, "xmlcharrefreplace"))
7414 known_errorHandler = 4;
7415 else
7416 known_errorHandler = 0;
7417 }
7418 switch (known_errorHandler) {
7419 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007420 raise_translate_exception(&exc, input, collstart,
7421 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007422 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 case 2: /* replace */
7424 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007425 for (coll = collstart; coll<collend; coll++)
7426 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 /* fall through */
7428 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007429 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007430 break;
7431 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007432 /* generate replacement (temporarily (mis)uses i) */
7433 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007434 char buffer[2+29+1+1];
7435 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007436 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7437 if (charmaptranslate_makespace(&output, &osize,
7438 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 goto onError;
7440 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007441 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007443 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 break;
7445 default:
7446 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007447 reason, input, &exc,
7448 collstart, collend, &newpos);
7449 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 goto onError;
7451 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007452 repsize = PyUnicode_GET_LENGTH(repunicode);
7453 if (charmaptranslate_makespace(&output, &osize,
7454 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 Py_DECREF(repunicode);
7456 goto onError;
7457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007458 for (uni2 = 0; repsize-->0; ++uni2)
7459 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7460 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007462 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007463 }
7464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007465 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7466 if (!res)
7467 goto onError;
7468 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007469 Py_XDECREF(exc);
7470 Py_XDECREF(errorHandler);
7471 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007474 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007475 Py_XDECREF(exc);
7476 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477 return NULL;
7478}
7479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007480/* Deprecated. Use PyUnicode_Translate instead. */
7481PyObject *
7482PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7483 Py_ssize_t size,
7484 PyObject *mapping,
7485 const char *errors)
7486{
7487 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7488 if (!unicode)
7489 return NULL;
7490 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7491}
7492
Alexander Belopolsky40018472011-02-26 01:02:56 +00007493PyObject *
7494PyUnicode_Translate(PyObject *str,
7495 PyObject *mapping,
7496 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497{
7498 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007499
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 str = PyUnicode_FromObject(str);
7501 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007503 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 Py_DECREF(str);
7505 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007506
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 Py_XDECREF(str);
7509 return NULL;
7510}
Tim Petersced69f82003-09-16 20:30:58 +00007511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007512static Py_UCS4
7513fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7514{
7515 /* No need to call PyUnicode_READY(self) because this function is only
7516 called as a callback from fixup() which does it already. */
7517 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7518 const int kind = PyUnicode_KIND(self);
7519 void *data = PyUnicode_DATA(self);
7520 Py_UCS4 maxchar = 0, ch, fixed;
7521 Py_ssize_t i;
7522
7523 for (i = 0; i < len; ++i) {
7524 ch = PyUnicode_READ(kind, data, i);
7525 fixed = 0;
7526 if (ch > 127) {
7527 if (Py_UNICODE_ISSPACE(ch))
7528 fixed = ' ';
7529 else {
7530 const int decimal = Py_UNICODE_TODECIMAL(ch);
7531 if (decimal >= 0)
7532 fixed = '0' + decimal;
7533 }
7534 if (fixed != 0) {
7535 if (fixed > maxchar)
7536 maxchar = fixed;
7537 PyUnicode_WRITE(kind, data, i, fixed);
7538 }
7539 else if (ch > maxchar)
7540 maxchar = ch;
7541 }
7542 else if (ch > maxchar)
7543 maxchar = ch;
7544 }
7545
7546 return maxchar;
7547}
7548
7549PyObject *
7550_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7551{
7552 if (!PyUnicode_Check(unicode)) {
7553 PyErr_BadInternalCall();
7554 return NULL;
7555 }
7556 if (PyUnicode_READY(unicode) == -1)
7557 return NULL;
7558 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7559 /* If the string is already ASCII, just return the same string */
7560 Py_INCREF(unicode);
7561 return unicode;
7562 }
7563 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7564}
7565
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007566PyObject *
7567PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7568 Py_ssize_t length)
7569{
7570 PyObject *result;
7571 Py_UNICODE *p; /* write pointer into result */
7572 Py_ssize_t i;
7573 /* Copy to a new string */
7574 result = (PyObject *)_PyUnicode_New(length);
7575 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7576 if (result == NULL)
7577 return result;
7578 p = PyUnicode_AS_UNICODE(result);
7579 /* Iterate over code points */
7580 for (i = 0; i < length; i++) {
7581 Py_UNICODE ch =s[i];
7582 if (ch > 127) {
7583 int decimal = Py_UNICODE_TODECIMAL(ch);
7584 if (decimal >= 0)
7585 p[i] = '0' + decimal;
7586 }
7587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007588 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7589 Py_DECREF(result);
7590 return NULL;
7591 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007592 return result;
7593}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007594/* --- Decimal Encoder ---------------------------------------------------- */
7595
Alexander Belopolsky40018472011-02-26 01:02:56 +00007596int
7597PyUnicode_EncodeDecimal(Py_UNICODE *s,
7598 Py_ssize_t length,
7599 char *output,
7600 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007601{
7602 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007603 PyObject *errorHandler = NULL;
7604 PyObject *exc = NULL;
7605 const char *encoding = "decimal";
7606 const char *reason = "invalid decimal Unicode string";
7607 /* the following variable is used for caching string comparisons
7608 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7609 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007610
7611 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 PyErr_BadArgument();
7613 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007614 }
7615
7616 p = s;
7617 end = s + length;
7618 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 register Py_UNICODE ch = *p;
7620 int decimal;
7621 PyObject *repunicode;
7622 Py_ssize_t repsize;
7623 Py_ssize_t newpos;
7624 Py_UNICODE *uni2;
7625 Py_UNICODE *collstart;
7626 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007627
Benjamin Peterson29060642009-01-31 22:14:21 +00007628 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007629 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 ++p;
7631 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007632 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 decimal = Py_UNICODE_TODECIMAL(ch);
7634 if (decimal >= 0) {
7635 *output++ = '0' + decimal;
7636 ++p;
7637 continue;
7638 }
7639 if (0 < ch && ch < 256) {
7640 *output++ = (char)ch;
7641 ++p;
7642 continue;
7643 }
7644 /* All other characters are considered unencodable */
7645 collstart = p;
7646 collend = p+1;
7647 while (collend < end) {
7648 if ((0 < *collend && *collend < 256) ||
7649 !Py_UNICODE_ISSPACE(*collend) ||
7650 Py_UNICODE_TODECIMAL(*collend))
7651 break;
7652 }
7653 /* cache callback name lookup
7654 * (if not done yet, i.e. it's the first error) */
7655 if (known_errorHandler==-1) {
7656 if ((errors==NULL) || (!strcmp(errors, "strict")))
7657 known_errorHandler = 1;
7658 else if (!strcmp(errors, "replace"))
7659 known_errorHandler = 2;
7660 else if (!strcmp(errors, "ignore"))
7661 known_errorHandler = 3;
7662 else if (!strcmp(errors, "xmlcharrefreplace"))
7663 known_errorHandler = 4;
7664 else
7665 known_errorHandler = 0;
7666 }
7667 switch (known_errorHandler) {
7668 case 1: /* strict */
7669 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7670 goto onError;
7671 case 2: /* replace */
7672 for (p = collstart; p < collend; ++p)
7673 *output++ = '?';
7674 /* fall through */
7675 case 3: /* ignore */
7676 p = collend;
7677 break;
7678 case 4: /* xmlcharrefreplace */
7679 /* generate replacement (temporarily (mis)uses p) */
7680 for (p = collstart; p < collend; ++p)
7681 output += sprintf(output, "&#%d;", (int)*p);
7682 p = collend;
7683 break;
7684 default:
7685 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7686 encoding, reason, s, length, &exc,
7687 collstart-s, collend-s, &newpos);
7688 if (repunicode == NULL)
7689 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007690 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007691 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007692 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7693 Py_DECREF(repunicode);
7694 goto onError;
7695 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007696 /* generate replacement */
7697 repsize = PyUnicode_GET_SIZE(repunicode);
7698 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7699 Py_UNICODE ch = *uni2;
7700 if (Py_UNICODE_ISSPACE(ch))
7701 *output++ = ' ';
7702 else {
7703 decimal = Py_UNICODE_TODECIMAL(ch);
7704 if (decimal >= 0)
7705 *output++ = '0' + decimal;
7706 else if (0 < ch && ch < 256)
7707 *output++ = (char)ch;
7708 else {
7709 Py_DECREF(repunicode);
7710 raise_encode_exception(&exc, encoding,
7711 s, length, collstart-s, collend-s, reason);
7712 goto onError;
7713 }
7714 }
7715 }
7716 p = s + newpos;
7717 Py_DECREF(repunicode);
7718 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007719 }
7720 /* 0-terminate the output string */
7721 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007722 Py_XDECREF(exc);
7723 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007724 return 0;
7725
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007727 Py_XDECREF(exc);
7728 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007729 return -1;
7730}
7731
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732/* --- Helpers ------------------------------------------------------------ */
7733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007734#include "stringlib/ucs1lib.h"
7735#include "stringlib/fastsearch.h"
7736#include "stringlib/partition.h"
7737#include "stringlib/split.h"
7738#include "stringlib/count.h"
7739#include "stringlib/find.h"
7740#include "stringlib/localeutil.h"
7741#include "stringlib/undef.h"
7742
7743#include "stringlib/ucs2lib.h"
7744#include "stringlib/fastsearch.h"
7745#include "stringlib/partition.h"
7746#include "stringlib/split.h"
7747#include "stringlib/count.h"
7748#include "stringlib/find.h"
7749#include "stringlib/localeutil.h"
7750#include "stringlib/undef.h"
7751
7752#include "stringlib/ucs4lib.h"
7753#include "stringlib/fastsearch.h"
7754#include "stringlib/partition.h"
7755#include "stringlib/split.h"
7756#include "stringlib/count.h"
7757#include "stringlib/find.h"
7758#include "stringlib/localeutil.h"
7759#include "stringlib/undef.h"
7760
7761static Py_ssize_t
7762any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7763 const Py_UCS1*, Py_ssize_t,
7764 Py_ssize_t, Py_ssize_t),
7765 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7766 const Py_UCS2*, Py_ssize_t,
7767 Py_ssize_t, Py_ssize_t),
7768 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7769 const Py_UCS4*, Py_ssize_t,
7770 Py_ssize_t, Py_ssize_t),
7771 PyObject* s1, PyObject* s2,
7772 Py_ssize_t start,
7773 Py_ssize_t end)
7774{
7775 int kind1, kind2, kind;
7776 void *buf1, *buf2;
7777 Py_ssize_t len1, len2, result;
7778
7779 kind1 = PyUnicode_KIND(s1);
7780 kind2 = PyUnicode_KIND(s2);
7781 kind = kind1 > kind2 ? kind1 : kind2;
7782 buf1 = PyUnicode_DATA(s1);
7783 buf2 = PyUnicode_DATA(s2);
7784 if (kind1 != kind)
7785 buf1 = _PyUnicode_AsKind(s1, kind);
7786 if (!buf1)
7787 return -2;
7788 if (kind2 != kind)
7789 buf2 = _PyUnicode_AsKind(s2, kind);
7790 if (!buf2) {
7791 if (kind1 != kind) PyMem_Free(buf1);
7792 return -2;
7793 }
7794 len1 = PyUnicode_GET_LENGTH(s1);
7795 len2 = PyUnicode_GET_LENGTH(s2);
7796
7797 switch(kind) {
7798 case PyUnicode_1BYTE_KIND:
7799 result = ucs1(buf1, len1, buf2, len2, start, end);
7800 break;
7801 case PyUnicode_2BYTE_KIND:
7802 result = ucs2(buf1, len1, buf2, len2, start, end);
7803 break;
7804 case PyUnicode_4BYTE_KIND:
7805 result = ucs4(buf1, len1, buf2, len2, start, end);
7806 break;
7807 default:
7808 assert(0); result = -2;
7809 }
7810
7811 if (kind1 != kind)
7812 PyMem_Free(buf1);
7813 if (kind2 != kind)
7814 PyMem_Free(buf2);
7815
7816 return result;
7817}
7818
7819Py_ssize_t
7820_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7821 Py_ssize_t n_buffer,
7822 void *digits, Py_ssize_t n_digits,
7823 Py_ssize_t min_width,
7824 const char *grouping,
7825 const char *thousands_sep)
7826{
7827 switch(kind) {
7828 case PyUnicode_1BYTE_KIND:
7829 return _PyUnicode_ucs1_InsertThousandsGrouping(
7830 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7831 min_width, grouping, thousands_sep);
7832 case PyUnicode_2BYTE_KIND:
7833 return _PyUnicode_ucs2_InsertThousandsGrouping(
7834 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7835 min_width, grouping, thousands_sep);
7836 case PyUnicode_4BYTE_KIND:
7837 return _PyUnicode_ucs4_InsertThousandsGrouping(
7838 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7839 min_width, grouping, thousands_sep);
7840 }
7841 assert(0);
7842 return -1;
7843}
7844
7845
Eric Smith8c663262007-08-25 02:26:07 +00007846#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007847#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007848
Thomas Wouters477c8d52006-05-27 19:21:47 +00007849#include "stringlib/count.h"
7850#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007851
/* helper macro to fixup start/end slice values: end is clamped to len, and
   a negative start or end is taken relative to len (and clamped at 0).
   start and end must be modifiable lvalues; len is evaluated several times
   and must not have side effects.  The body is wrapped in do/while(0) so
   that "ADJUST_INDICES(...);" is a single statement, safe inside an
   unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007866
Alexander Belopolsky40018472011-02-26 01:02:56 +00007867Py_ssize_t
7868PyUnicode_Count(PyObject *str,
7869 PyObject *substr,
7870 Py_ssize_t start,
7871 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007873 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007874 PyUnicodeObject* str_obj;
7875 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007876 int kind1, kind2, kind;
7877 void *buf1 = NULL, *buf2 = NULL;
7878 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007879
Thomas Wouters477c8d52006-05-27 19:21:47 +00007880 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007883 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007884 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 Py_DECREF(str_obj);
7886 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 }
Tim Petersced69f82003-09-16 20:30:58 +00007888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007889 kind1 = PyUnicode_KIND(str_obj);
7890 kind2 = PyUnicode_KIND(sub_obj);
7891 kind = kind1 > kind2 ? kind1 : kind2;
7892 buf1 = PyUnicode_DATA(str_obj);
7893 if (kind1 != kind)
7894 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7895 if (!buf1)
7896 goto onError;
7897 buf2 = PyUnicode_DATA(sub_obj);
7898 if (kind2 != kind)
7899 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7900 if (!buf2)
7901 goto onError;
7902 len1 = PyUnicode_GET_LENGTH(str_obj);
7903 len2 = PyUnicode_GET_LENGTH(sub_obj);
7904
7905 ADJUST_INDICES(start, end, len1);
7906 switch(kind) {
7907 case PyUnicode_1BYTE_KIND:
7908 result = ucs1lib_count(
7909 ((Py_UCS1*)buf1) + start, end - start,
7910 buf2, len2, PY_SSIZE_T_MAX
7911 );
7912 break;
7913 case PyUnicode_2BYTE_KIND:
7914 result = ucs2lib_count(
7915 ((Py_UCS2*)buf1) + start, end - start,
7916 buf2, len2, PY_SSIZE_T_MAX
7917 );
7918 break;
7919 case PyUnicode_4BYTE_KIND:
7920 result = ucs4lib_count(
7921 ((Py_UCS4*)buf1) + start, end - start,
7922 buf2, len2, PY_SSIZE_T_MAX
7923 );
7924 break;
7925 default:
7926 assert(0); result = 0;
7927 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007928
7929 Py_DECREF(sub_obj);
7930 Py_DECREF(str_obj);
7931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007932 if (kind1 != kind)
7933 PyMem_Free(buf1);
7934 if (kind2 != kind)
7935 PyMem_Free(buf2);
7936
Guido van Rossumd57fd912000-03-10 22:53:23 +00007937 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007938 onError:
7939 Py_DECREF(sub_obj);
7940 Py_DECREF(str_obj);
7941 if (kind1 != kind && buf1)
7942 PyMem_Free(buf1);
7943 if (kind2 != kind && buf2)
7944 PyMem_Free(buf2);
7945 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946}
7947
Alexander Belopolsky40018472011-02-26 01:02:56 +00007948Py_ssize_t
7949PyUnicode_Find(PyObject *str,
7950 PyObject *sub,
7951 Py_ssize_t start,
7952 Py_ssize_t end,
7953 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007955 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00007956
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007958 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007960 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007961 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 Py_DECREF(str);
7963 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 }
Tim Petersced69f82003-09-16 20:30:58 +00007965
Thomas Wouters477c8d52006-05-27 19:21:47 +00007966 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007967 result = any_find_slice(
7968 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
7969 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007970 );
7971 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007972 result = any_find_slice(
7973 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
7974 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007975 );
7976
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007978 Py_DECREF(sub);
7979
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 return result;
7981}
7982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007983Py_ssize_t
7984PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
7985 Py_ssize_t start, Py_ssize_t end,
7986 int direction)
7987{
7988 char *result;
7989 int kind;
7990 if (PyUnicode_READY(str) == -1)
7991 return -2;
7992 if (end > PyUnicode_GET_LENGTH(str))
7993 end = PyUnicode_GET_LENGTH(str);
7994 kind = PyUnicode_KIND(str);
7995 result = findchar(PyUnicode_1BYTE_DATA(str)
7996 + PyUnicode_KIND_SIZE(kind, start),
7997 kind,
7998 end-start, ch, direction);
7999 if (!result)
8000 return -1;
8001 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8002}
8003
Alexander Belopolsky40018472011-02-26 01:02:56 +00008004static int
8005tailmatch(PyUnicodeObject *self,
8006 PyUnicodeObject *substring,
8007 Py_ssize_t start,
8008 Py_ssize_t end,
8009 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008011 int kind_self;
8012 int kind_sub;
8013 void *data_self;
8014 void *data_sub;
8015 Py_ssize_t offset;
8016 Py_ssize_t i;
8017 Py_ssize_t end_sub;
8018
8019 if (PyUnicode_READY(self) == -1 ||
8020 PyUnicode_READY(substring) == -1)
8021 return 0;
8022
8023 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 return 1;
8025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8027 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008031 kind_self = PyUnicode_KIND(self);
8032 data_self = PyUnicode_DATA(self);
8033 kind_sub = PyUnicode_KIND(substring);
8034 data_sub = PyUnicode_DATA(substring);
8035 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8036
8037 if (direction > 0)
8038 offset = end;
8039 else
8040 offset = start;
8041
8042 if (PyUnicode_READ(kind_self, data_self, offset) ==
8043 PyUnicode_READ(kind_sub, data_sub, 0) &&
8044 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8045 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8046 /* If both are of the same kind, memcmp is sufficient */
8047 if (kind_self == kind_sub) {
8048 return ! memcmp((char *)data_self +
8049 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8050 data_sub,
8051 PyUnicode_GET_LENGTH(substring) *
8052 PyUnicode_CHARACTER_SIZE(substring));
8053 }
8054 /* otherwise we have to compare each character by first accesing it */
8055 else {
8056 /* We do not need to compare 0 and len(substring)-1 because
8057 the if statement above ensured already that they are equal
8058 when we end up here. */
8059 // TODO: honor direction and do a forward or backwards search
8060 for (i = 1; i < end_sub; ++i) {
8061 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8062 PyUnicode_READ(kind_sub, data_sub, i))
8063 return 0;
8064 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067 }
8068
8069 return 0;
8070}
8071
Alexander Belopolsky40018472011-02-26 01:02:56 +00008072Py_ssize_t
8073PyUnicode_Tailmatch(PyObject *str,
8074 PyObject *substr,
8075 Py_ssize_t start,
8076 Py_ssize_t end,
8077 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008079 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008080
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 str = PyUnicode_FromObject(str);
8082 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084 substr = PyUnicode_FromObject(substr);
8085 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 Py_DECREF(str);
8087 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088 }
Tim Petersced69f82003-09-16 20:30:58 +00008089
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 (PyUnicodeObject *)substr,
8092 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093 Py_DECREF(str);
8094 Py_DECREF(substr);
8095 return result;
8096}
8097
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098/* Apply fixfct filter to the Unicode object self and return a
8099 reference to the modified object */
8100
Alexander Belopolsky40018472011-02-26 01:02:56 +00008101static PyObject *
8102fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008103 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008105 PyObject *u;
8106 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 if (PyUnicode_READY(self) == -1)
8109 return NULL;
8110 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8111 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8112 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008116 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8117 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 /* fix functions return the new maximum character in a string,
8120 if the kind of the resulting unicode object does not change,
8121 everything is fine. Otherwise we need to change the string kind
8122 and re-run the fix function. */
8123 maxchar_new = fixfct((PyUnicodeObject*)u);
8124 if (maxchar_new == 0)
8125 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8126 else if (maxchar_new <= 127)
8127 maxchar_new = 127;
8128 else if (maxchar_new <= 255)
8129 maxchar_new = 255;
8130 else if (maxchar_new <= 65535)
8131 maxchar_new = 65535;
8132 else
8133 maxchar_new = 1114111; /* 0x10ffff */
8134
8135 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 /* fixfct should return TRUE if it modified the buffer. If
8137 FALSE, return a reference to the original buffer instead
8138 (to save space, not time) */
8139 Py_INCREF(self);
8140 Py_DECREF(u);
8141 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008143 else if (maxchar_new == maxchar_old) {
8144 return u;
8145 }
8146 else {
8147 /* In case the maximum character changed, we need to
8148 convert the string to the new category. */
8149 PyObject *v = PyUnicode_New(
8150 PyUnicode_GET_LENGTH(self), maxchar_new);
8151 if (v == NULL) {
8152 Py_DECREF(u);
8153 return NULL;
8154 }
8155 if (maxchar_new > maxchar_old) {
8156 /* If the maxchar increased so that the kind changed, not all
8157 characters are representable anymore and we need to fix the
8158 string again. This only happens in very few cases. */
8159 PyUnicode_CopyCharacters(v, 0, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self));
8160 maxchar_old = fixfct((PyUnicodeObject*)v);
8161 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8162 }
8163 else
8164 PyUnicode_CopyCharacters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
8165
8166 Py_DECREF(u);
8167 return v;
8168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169}
8170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008171static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008172fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008174 /* No need to call PyUnicode_READY(self) because this function is only
8175 called as a callback from fixup() which does it already. */
8176 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8177 const int kind = PyUnicode_KIND(self);
8178 void *data = PyUnicode_DATA(self);
8179 int touched = 0;
8180 Py_UCS4 maxchar = 0;
8181 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008183 for (i = 0; i < len; ++i) {
8184 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8185 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8186 if (up != ch) {
8187 if (up > maxchar)
8188 maxchar = up;
8189 PyUnicode_WRITE(kind, data, i, up);
8190 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192 else if (ch > maxchar)
8193 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194 }
8195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008196 if (touched)
8197 return maxchar;
8198 else
8199 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200}
8201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008202static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008203fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008205 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8206 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8207 const int kind = PyUnicode_KIND(self);
8208 void *data = PyUnicode_DATA(self);
8209 int touched = 0;
8210 Py_UCS4 maxchar = 0;
8211 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008213 for(i = 0; i < len; ++i) {
8214 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8215 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8216 if (lo != ch) {
8217 if (lo > maxchar)
8218 maxchar = lo;
8219 PyUnicode_WRITE(kind, data, i, lo);
8220 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008222 else if (ch > maxchar)
8223 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 }
8225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008226 if (touched)
8227 return maxchar;
8228 else
8229 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230}
8231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008232static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008233fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8236 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8237 const int kind = PyUnicode_KIND(self);
8238 void *data = PyUnicode_DATA(self);
8239 int touched = 0;
8240 Py_UCS4 maxchar = 0;
8241 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008243 for(i = 0; i < len; ++i) {
8244 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8245 Py_UCS4 nu = 0;
8246
8247 if (Py_UNICODE_ISUPPER(ch))
8248 nu = Py_UNICODE_TOLOWER(ch);
8249 else if (Py_UNICODE_ISLOWER(ch))
8250 nu = Py_UNICODE_TOUPPER(ch);
8251
8252 if (nu != 0) {
8253 if (nu > maxchar)
8254 maxchar = nu;
8255 PyUnicode_WRITE(kind, data, i, nu);
8256 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 else if (ch > maxchar)
8259 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 }
8261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008262 if (touched)
8263 return maxchar;
8264 else
8265 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266}
8267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008268static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008269fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8272 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8273 const int kind = PyUnicode_KIND(self);
8274 void *data = PyUnicode_DATA(self);
8275 int touched = 0;
8276 Py_UCS4 maxchar = 0;
8277 Py_ssize_t i = 0;
8278 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008279
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008280 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008282
8283 ch = PyUnicode_READ(kind, data, i);
8284 if (!Py_UNICODE_ISUPPER(ch)) {
8285 maxchar = Py_UNICODE_TOUPPER(ch);
8286 PyUnicode_WRITE(kind, data, i, maxchar);
8287 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008289 ++i;
8290 for(; i < len; ++i) {
8291 ch = PyUnicode_READ(kind, data, i);
8292 if (!Py_UNICODE_ISLOWER(ch)) {
8293 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8294 if (lo > maxchar)
8295 maxchar = lo;
8296 PyUnicode_WRITE(kind, data, i, lo);
8297 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008298 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008299 else if (ch > maxchar)
8300 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008302
8303 if (touched)
8304 return maxchar;
8305 else
8306 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307}
8308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008309static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008310fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8313 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8314 const int kind = PyUnicode_KIND(self);
8315 void *data = PyUnicode_DATA(self);
8316 Py_UCS4 maxchar = 0;
8317 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318 int previous_is_cased;
8319
8320 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008321 if (len == 1) {
8322 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8323 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8324 if (ti != ch) {
8325 PyUnicode_WRITE(kind, data, i, ti);
8326 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 }
8328 else
8329 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332 for(; i < len; ++i) {
8333 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8334 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008335
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008337 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008339 nu = Py_UNICODE_TOTITLE(ch);
8340
8341 if (nu > maxchar)
8342 maxchar = nu;
8343 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008344
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 if (Py_UNICODE_ISLOWER(ch) ||
8346 Py_UNICODE_ISUPPER(ch) ||
8347 Py_UNICODE_ISTITLE(ch))
8348 previous_is_cased = 1;
8349 else
8350 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353}
8354
Tim Peters8ce9f162004-08-27 01:49:32 +00008355PyObject *
8356PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008358 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008359 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008361 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008362 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8363 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008364 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 Py_ssize_t sz, i, res_offset;
8366 Py_UCS4 maxchar = 0;
8367 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368
Tim Peters05eba1f2004-08-27 21:32:02 +00008369 fseq = PySequence_Fast(seq, "");
8370 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008371 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008372 }
8373
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008374 /* NOTE: the following code can't call back into Python code,
8375 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008376 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008377
Tim Peters05eba1f2004-08-27 21:32:02 +00008378 seqlen = PySequence_Fast_GET_SIZE(fseq);
8379 /* If empty sequence, return u"". */
8380 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008382 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008383 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008384 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008385 /* If singleton sequence with an exact Unicode, return that. */
8386 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 item = items[0];
8388 if (PyUnicode_CheckExact(item)) {
8389 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008390 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 goto Done;
8392 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008393 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008394 else {
8395 /* Set up sep and seplen */
8396 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 /* fall back to a blank space separator */
8398 sep = PyUnicode_FromOrdinal(' ');
8399 if (!sep || PyUnicode_READY(sep) == -1)
8400 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008401 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008402 else {
8403 if (!PyUnicode_Check(separator)) {
8404 PyErr_Format(PyExc_TypeError,
8405 "separator: expected str instance,"
8406 " %.80s found",
8407 Py_TYPE(separator)->tp_name);
8408 goto onError;
8409 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008410 if (PyUnicode_READY(separator) == -1)
8411 goto onError;
8412 sep = separator;
8413 seplen = PyUnicode_GET_LENGTH(separator);
8414 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8415 /* inc refcount to keep this code path symetric with the
8416 above case of a blank separator */
8417 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008418 }
8419 }
8420
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008421 /* There are at least two things to join, or else we have a subclass
8422 * of str in the sequence.
8423 * Do a pre-pass to figure out the total amount of space we'll
8424 * need (sz), and see whether all argument are strings.
8425 */
8426 sz = 0;
8427 for (i = 0; i < seqlen; i++) {
8428 const Py_ssize_t old_sz = sz;
8429 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 if (!PyUnicode_Check(item)) {
8431 PyErr_Format(PyExc_TypeError,
8432 "sequence item %zd: expected str instance,"
8433 " %.80s found",
8434 i, Py_TYPE(item)->tp_name);
8435 goto onError;
8436 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437 if (PyUnicode_READY(item) == -1)
8438 goto onError;
8439 sz += PyUnicode_GET_LENGTH(item);
8440 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8441 if (item_maxchar > maxchar)
8442 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008443 if (i != 0)
8444 sz += seplen;
8445 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8446 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008448 goto onError;
8449 }
8450 }
Tim Petersced69f82003-09-16 20:30:58 +00008451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008453 if (res == NULL)
8454 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008455
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008456 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008458 Py_ssize_t itemlen;
8459 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 /* Copy item, and maybe the separator. */
8462 if (i) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008463 PyUnicode_CopyCharacters(res, res_offset,
8464 sep, 0, seplen);
8465 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 PyUnicode_CopyCharacters(res, res_offset,
8468 item, 0, itemlen);
8469 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008472
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008474 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008475 Py_XDECREF(sep);
8476 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008479 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008481 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482 return NULL;
8483}
8484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485#define FILL(kind, data, value, start, length) \
8486 do { \
8487 Py_ssize_t i_ = 0; \
8488 assert(kind != PyUnicode_WCHAR_KIND); \
8489 switch ((kind)) { \
8490 case PyUnicode_1BYTE_KIND: { \
8491 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8492 memset(to_, (unsigned char)value, length); \
8493 break; \
8494 } \
8495 case PyUnicode_2BYTE_KIND: { \
8496 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8497 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8498 break; \
8499 } \
8500 default: { \
8501 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8502 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8503 break; \
8504 } \
8505 } \
8506 } while (0)
8507
Alexander Belopolsky40018472011-02-26 01:02:56 +00008508static PyUnicodeObject *
8509pad(PyUnicodeObject *self,
8510 Py_ssize_t left,
8511 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 PyObject *u;
8515 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516
8517 if (left < 0)
8518 left = 0;
8519 if (right < 0)
8520 right = 0;
8521
Tim Peters7a29bd52001-09-12 03:03:31 +00008522 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523 Py_INCREF(self);
8524 return self;
8525 }
8526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8528 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008529 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8530 return NULL;
8531 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8533 if (fill > maxchar)
8534 maxchar = fill;
8535 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536 if (u) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537 int kind = PyUnicode_KIND(u);
8538 void *data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539 if (left)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 FILL(kind, data, fill, 0, left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541 if (right)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
8543 PyUnicode_CopyCharacters(u, left, (PyObject*)self, 0, _PyUnicode_LENGTH(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544 }
8545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549
Alexander Belopolsky40018472011-02-26 01:02:56 +00008550PyObject *
8551PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554
8555 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 switch(PyUnicode_KIND(string)) {
8560 case PyUnicode_1BYTE_KIND:
8561 list = ucs1lib_splitlines(
8562 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8563 PyUnicode_GET_LENGTH(string), keepends);
8564 break;
8565 case PyUnicode_2BYTE_KIND:
8566 list = ucs2lib_splitlines(
8567 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8568 PyUnicode_GET_LENGTH(string), keepends);
8569 break;
8570 case PyUnicode_4BYTE_KIND:
8571 list = ucs4lib_splitlines(
8572 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8573 PyUnicode_GET_LENGTH(string), keepends);
8574 break;
8575 default:
8576 assert(0);
8577 list = 0;
8578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579 Py_DECREF(string);
8580 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581}
8582
Alexander Belopolsky40018472011-02-26 01:02:56 +00008583static PyObject *
8584split(PyUnicodeObject *self,
8585 PyUnicodeObject *substring,
8586 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588 int kind1, kind2, kind;
8589 void *buf1, *buf2;
8590 Py_ssize_t len1, len2;
8591 PyObject* out;
8592
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008594 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 if (PyUnicode_READY(self) == -1)
8597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599 if (substring == NULL)
8600 switch(PyUnicode_KIND(self)) {
8601 case PyUnicode_1BYTE_KIND:
8602 return ucs1lib_split_whitespace(
8603 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8604 PyUnicode_GET_LENGTH(self), maxcount
8605 );
8606 case PyUnicode_2BYTE_KIND:
8607 return ucs2lib_split_whitespace(
8608 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8609 PyUnicode_GET_LENGTH(self), maxcount
8610 );
8611 case PyUnicode_4BYTE_KIND:
8612 return ucs4lib_split_whitespace(
8613 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8614 PyUnicode_GET_LENGTH(self), maxcount
8615 );
8616 default:
8617 assert(0);
8618 return NULL;
8619 }
8620
8621 if (PyUnicode_READY(substring) == -1)
8622 return NULL;
8623
8624 kind1 = PyUnicode_KIND(self);
8625 kind2 = PyUnicode_KIND(substring);
8626 kind = kind1 > kind2 ? kind1 : kind2;
8627 buf1 = PyUnicode_DATA(self);
8628 buf2 = PyUnicode_DATA(substring);
8629 if (kind1 != kind)
8630 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8631 if (!buf1)
8632 return NULL;
8633 if (kind2 != kind)
8634 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8635 if (!buf2) {
8636 if (kind1 != kind) PyMem_Free(buf1);
8637 return NULL;
8638 }
8639 len1 = PyUnicode_GET_LENGTH(self);
8640 len2 = PyUnicode_GET_LENGTH(substring);
8641
8642 switch(kind) {
8643 case PyUnicode_1BYTE_KIND:
8644 out = ucs1lib_split(
8645 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8646 break;
8647 case PyUnicode_2BYTE_KIND:
8648 out = ucs2lib_split(
8649 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8650 break;
8651 case PyUnicode_4BYTE_KIND:
8652 out = ucs4lib_split(
8653 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8654 break;
8655 default:
8656 out = NULL;
8657 }
8658 if (kind1 != kind)
8659 PyMem_Free(buf1);
8660 if (kind2 != kind)
8661 PyMem_Free(buf2);
8662 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663}
8664
Alexander Belopolsky40018472011-02-26 01:02:56 +00008665static PyObject *
8666rsplit(PyUnicodeObject *self,
8667 PyUnicodeObject *substring,
8668 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008669{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 int kind1, kind2, kind;
8671 void *buf1, *buf2;
8672 Py_ssize_t len1, len2;
8673 PyObject* out;
8674
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008675 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008676 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678 if (PyUnicode_READY(self) == -1)
8679 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 if (substring == NULL)
8682 switch(PyUnicode_KIND(self)) {
8683 case PyUnicode_1BYTE_KIND:
8684 return ucs1lib_rsplit_whitespace(
8685 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8686 PyUnicode_GET_LENGTH(self), maxcount
8687 );
8688 case PyUnicode_2BYTE_KIND:
8689 return ucs2lib_rsplit_whitespace(
8690 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8691 PyUnicode_GET_LENGTH(self), maxcount
8692 );
8693 case PyUnicode_4BYTE_KIND:
8694 return ucs4lib_rsplit_whitespace(
8695 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8696 PyUnicode_GET_LENGTH(self), maxcount
8697 );
8698 default:
8699 assert(0);
8700 return NULL;
8701 }
8702
8703 if (PyUnicode_READY(substring) == -1)
8704 return NULL;
8705
8706 kind1 = PyUnicode_KIND(self);
8707 kind2 = PyUnicode_KIND(substring);
8708 kind = kind1 > kind2 ? kind1 : kind2;
8709 buf1 = PyUnicode_DATA(self);
8710 buf2 = PyUnicode_DATA(substring);
8711 if (kind1 != kind)
8712 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8713 if (!buf1)
8714 return NULL;
8715 if (kind2 != kind)
8716 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8717 if (!buf2) {
8718 if (kind1 != kind) PyMem_Free(buf1);
8719 return NULL;
8720 }
8721 len1 = PyUnicode_GET_LENGTH(self);
8722 len2 = PyUnicode_GET_LENGTH(substring);
8723
8724 switch(kind) {
8725 case PyUnicode_1BYTE_KIND:
8726 out = ucs1lib_rsplit(
8727 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8728 break;
8729 case PyUnicode_2BYTE_KIND:
8730 out = ucs2lib_rsplit(
8731 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8732 break;
8733 case PyUnicode_4BYTE_KIND:
8734 out = ucs4lib_rsplit(
8735 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8736 break;
8737 default:
8738 out = NULL;
8739 }
8740 if (kind1 != kind)
8741 PyMem_Free(buf1);
8742 if (kind2 != kind)
8743 PyMem_Free(buf2);
8744 return out;
8745}
8746
8747static Py_ssize_t
8748anylib_find(int kind, void *buf1, Py_ssize_t len1,
8749 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8750{
8751 switch(kind) {
8752 case PyUnicode_1BYTE_KIND:
8753 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8754 case PyUnicode_2BYTE_KIND:
8755 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8756 case PyUnicode_4BYTE_KIND:
8757 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8758 }
8759 assert(0);
8760 return -1;
8761}
8762
8763static Py_ssize_t
8764anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8765 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8766{
8767 switch(kind) {
8768 case PyUnicode_1BYTE_KIND:
8769 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8770 case PyUnicode_2BYTE_KIND:
8771 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8772 case PyUnicode_4BYTE_KIND:
8773 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8774 }
8775 assert(0);
8776 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008777}
8778
Alexander Belopolsky40018472011-02-26 01:02:56 +00008779static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780replace(PyObject *self, PyObject *str1,
8781 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 PyObject *u;
8784 char *sbuf = PyUnicode_DATA(self);
8785 char *buf1 = PyUnicode_DATA(str1);
8786 char *buf2 = PyUnicode_DATA(str2);
8787 int srelease = 0, release1 = 0, release2 = 0;
8788 int skind = PyUnicode_KIND(self);
8789 int kind1 = PyUnicode_KIND(str1);
8790 int kind2 = PyUnicode_KIND(str2);
8791 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8792 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8793 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794
8795 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008798 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 if (skind < kind1)
8801 /* substring too wide to be present */
8802 goto nothing;
8803
8804 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008805 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008806 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008807 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008808 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008810 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811 Py_UCS4 u1, u2, maxchar;
8812 int mayshrink, rkind;
8813 u1 = PyUnicode_READ_CHAR(str1, 0);
8814 if (!findchar(sbuf, PyUnicode_KIND(self),
8815 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008816 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 u2 = PyUnicode_READ_CHAR(str2, 0);
8818 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8819 /* Replacing u1 with u2 may cause a maxchar reduction in the
8820 result string. */
8821 mayshrink = maxchar > 127;
8822 if (u2 > maxchar) {
8823 maxchar = u2;
8824 mayshrink = 0;
8825 }
8826 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008827 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 goto error;
8829 PyUnicode_CopyCharacters(u, 0,
8830 (PyObject*)self, 0, slen);
8831 rkind = PyUnicode_KIND(u);
8832 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8833 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008834 if (--maxcount < 0)
8835 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838 if (mayshrink) {
8839 PyObject *tmp = u;
8840 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8841 PyUnicode_GET_LENGTH(tmp));
8842 Py_DECREF(tmp);
8843 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 int rkind = skind;
8846 char *res;
8847 if (kind1 < rkind) {
8848 /* widen substring */
8849 buf1 = _PyUnicode_AsKind(str1, rkind);
8850 if (!buf1) goto error;
8851 release1 = 1;
8852 }
8853 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008854 if (i < 0)
8855 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 if (rkind > kind2) {
8857 /* widen replacement */
8858 buf2 = _PyUnicode_AsKind(str2, rkind);
8859 if (!buf2) goto error;
8860 release2 = 1;
8861 }
8862 else if (rkind < kind2) {
8863 /* widen self and buf1 */
8864 rkind = kind2;
8865 if (release1) PyMem_Free(buf1);
8866 sbuf = _PyUnicode_AsKind(self, rkind);
8867 if (!sbuf) goto error;
8868 srelease = 1;
8869 buf1 = _PyUnicode_AsKind(str1, rkind);
8870 if (!buf1) goto error;
8871 release1 = 1;
8872 }
8873 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8874 if (!res) {
8875 PyErr_NoMemory();
8876 goto error;
8877 }
8878 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008879 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008880 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8881 buf2,
8882 PyUnicode_KIND_SIZE(rkind, len2));
8883 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008884
8885 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8887 slen-i,
8888 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008889 if (i == -1)
8890 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8892 buf2,
8893 PyUnicode_KIND_SIZE(rkind, len2));
8894 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896
8897 u = PyUnicode_FromKindAndData(rkind, res, slen);
8898 PyMem_Free(res);
8899 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 Py_ssize_t n, i, j, ires;
8904 Py_ssize_t product, new_size;
8905 int rkind = skind;
8906 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 if (kind1 < rkind) {
8909 buf1 = _PyUnicode_AsKind(str1, rkind);
8910 if (!buf1) goto error;
8911 release1 = 1;
8912 }
8913 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008914 if (n == 0)
8915 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 if (kind2 < rkind) {
8917 buf2 = _PyUnicode_AsKind(str2, rkind);
8918 if (!buf2) goto error;
8919 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008921 else if (kind2 > rkind) {
8922 rkind = kind2;
8923 sbuf = _PyUnicode_AsKind(self, rkind);
8924 if (!sbuf) goto error;
8925 srelease = 1;
8926 if (release1) PyMem_Free(buf1);
8927 buf1 = _PyUnicode_AsKind(str1, rkind);
8928 if (!buf1) goto error;
8929 release1 = 1;
8930 }
8931 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
8932 PyUnicode_GET_LENGTH(str1))); */
8933 product = n * (len2-len1);
8934 if ((product / (len2-len1)) != n) {
8935 PyErr_SetString(PyExc_OverflowError,
8936 "replace string is too long");
8937 goto error;
8938 }
8939 new_size = slen + product;
8940 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
8941 PyErr_SetString(PyExc_OverflowError,
8942 "replace string is too long");
8943 goto error;
8944 }
8945 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
8946 if (!res)
8947 goto error;
8948 ires = i = 0;
8949 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008950 while (n-- > 0) {
8951 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 j = anylib_find(rkind,
8953 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8954 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008955 if (j == -1)
8956 break;
8957 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008958 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8960 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8961 PyUnicode_KIND_SIZE(rkind, j-i));
8962 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008963 }
8964 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 if (len2 > 0) {
8966 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8967 buf2,
8968 PyUnicode_KIND_SIZE(rkind, len2));
8969 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008970 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008974 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8976 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8977 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00008978 } else {
8979 /* interleave */
8980 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8982 buf2,
8983 PyUnicode_KIND_SIZE(rkind, len2));
8984 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008985 if (--n <= 0)
8986 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8988 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8989 PyUnicode_KIND_SIZE(rkind, 1));
8990 ires++;
8991 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008992 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008993 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8994 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8995 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00008996 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999 if (srelease)
9000 PyMem_FREE(sbuf);
9001 if (release1)
9002 PyMem_FREE(buf1);
9003 if (release2)
9004 PyMem_FREE(buf2);
9005 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009006
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009008 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 if (srelease)
9010 PyMem_FREE(sbuf);
9011 if (release1)
9012 PyMem_FREE(buf1);
9013 if (release2)
9014 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009015 if (PyUnicode_CheckExact(self)) {
9016 Py_INCREF(self);
9017 return (PyObject *) self;
9018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009019 return PyUnicode_FromKindAndData(PyUnicode_KIND(self),
9020 PyUnicode_DATA(self),
9021 PyUnicode_GET_LENGTH(self));
9022 error:
9023 if (srelease && sbuf)
9024 PyMem_FREE(sbuf);
9025 if (release1 && buf1)
9026 PyMem_FREE(buf1);
9027 if (release2 && buf2)
9028 PyMem_FREE(buf2);
9029 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030}
9031
9032/* --- Unicode Object Methods --------------------------------------------- */
9033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009034PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009035 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036\n\
9037Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009038characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039
9040static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009041unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 return fixup(self, fixtitle);
9044}
9045
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009046PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009047 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048\n\
9049Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009050have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051
9052static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009053unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009054{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 return fixup(self, fixcapitalize);
9056}
9057
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, capitalize
   every word, then join the words with the default separator. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
9095
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009096/* Argument converter. Coerces to a single unicode character */
9097
9098static int
9099convert_uc(PyObject *obj, void *addr)
9100{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009102 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009103
Benjamin Peterson14339b62009-01-31 16:36:08 +00009104 uniobj = PyUnicode_FromObject(obj);
9105 if (uniobj == NULL) {
9106 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009107 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009108 return 0;
9109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009111 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009112 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009113 Py_DECREF(uniobj);
9114 return 0;
9115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116 if (PyUnicode_READY(uniobj)) {
9117 Py_DECREF(uniobj);
9118 return 0;
9119 }
9120 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009121 Py_DECREF(uniobj);
9122 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009123}
9124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009125PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009128Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009129done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130
9131static PyObject *
9132unicode_center(PyUnicodeObject *self, PyObject *args)
9133{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009134 Py_ssize_t marg, left;
9135 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 Py_UCS4 fillchar = ' ';
9137
9138 if (PyUnicode_READY(self) == -1)
9139 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140
Thomas Woutersde017742006-02-16 19:34:37 +00009141 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 return NULL;
9143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 Py_INCREF(self);
9146 return (PyObject*) self;
9147 }
9148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009149 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 left = marg / 2 + (marg & width & 1);
9151
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009152 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153}
9154
Marc-André Lemburge5034372000-08-08 08:04:29 +00009155#if 0
9156
9157/* This code should go into some future Unicode collation support
9158 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009159 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009160
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009161/* speedy UTF-16 code point order comparison */
9162/* gleaned from: */
9163/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9164
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009165static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009166{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009167 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009168 0, 0, 0, 0, 0, 0, 0, 0,
9169 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009170 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009171};
9172
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173static int
9174unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9175{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009176 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009177
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178 Py_UNICODE *s1 = str1->str;
9179 Py_UNICODE *s2 = str2->str;
9180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 len1 = str1->_base._base.length;
9182 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009183
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009185 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009186
9187 c1 = *s1++;
9188 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009189
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 if (c1 > (1<<11) * 26)
9191 c1 += utf16Fixup[c1>>11];
9192 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009193 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009194 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009195
9196 if (c1 != c2)
9197 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009198
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009199 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200 }
9201
9202 return (len1 < len2) ? -1 : (len1 != len2);
9203}
9204
Marc-André Lemburge5034372000-08-08 08:04:29 +00009205#else
9206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207/* This function assumes that str1 and str2 are readied by the caller. */
9208
Marc-André Lemburge5034372000-08-08 08:04:29 +00009209static int
9210unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9211{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212 int kind1, kind2;
9213 void *data1, *data2;
9214 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216 kind1 = PyUnicode_KIND(str1);
9217 kind2 = PyUnicode_KIND(str2);
9218 data1 = PyUnicode_DATA(str1);
9219 data2 = PyUnicode_DATA(str2);
9220 len1 = PyUnicode_GET_LENGTH(str1);
9221 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 for (i = 0; i < len1 && i < len2; ++i) {
9224 Py_UCS4 c1, c2;
9225 c1 = PyUnicode_READ(kind1, data1, i);
9226 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009227
9228 if (c1 != c2)
9229 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009230 }
9231
9232 return (len1 < len2) ? -1 : (len1 != len2);
9233}
9234
9235#endif
9236
Alexander Belopolsky40018472011-02-26 01:02:56 +00009237int
9238PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9241 if (PyUnicode_READY(left) == -1 ||
9242 PyUnicode_READY(right) == -1)
9243 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009244 return unicode_compare((PyUnicodeObject *)left,
9245 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009247 PyErr_Format(PyExc_TypeError,
9248 "Can't compare %.100s and %.100s",
9249 left->ob_type->tp_name,
9250 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009251 return -1;
9252}
9253
Martin v. Löwis5b222132007-06-10 09:51:05 +00009254int
9255PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9256{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 Py_ssize_t i;
9258 int kind;
9259 void *data;
9260 Py_UCS4 chr;
9261
Martin v. Löwis5b222132007-06-10 09:51:05 +00009262 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 if (PyUnicode_READY(uni) == -1)
9264 return -1;
9265 kind = PyUnicode_KIND(uni);
9266 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009267 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9269 if (chr != str[i])
9270 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009271 /* This check keeps Python strings that end in '\0' from comparing equal
9272 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009274 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009275 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009276 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009277 return 0;
9278}
9279
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009280
Benjamin Peterson29060642009-01-31 22:14:21 +00009281#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009282 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009283
Alexander Belopolsky40018472011-02-26 01:02:56 +00009284PyObject *
9285PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009286{
9287 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009288
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009289 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9290 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 if (PyUnicode_READY(left) == -1 ||
9292 PyUnicode_READY(right) == -1)
9293 return NULL;
9294 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9295 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009296 if (op == Py_EQ) {
9297 Py_INCREF(Py_False);
9298 return Py_False;
9299 }
9300 if (op == Py_NE) {
9301 Py_INCREF(Py_True);
9302 return Py_True;
9303 }
9304 }
9305 if (left == right)
9306 result = 0;
9307 else
9308 result = unicode_compare((PyUnicodeObject *)left,
9309 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009310
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009311 /* Convert the return value to a Boolean */
9312 switch (op) {
9313 case Py_EQ:
9314 v = TEST_COND(result == 0);
9315 break;
9316 case Py_NE:
9317 v = TEST_COND(result != 0);
9318 break;
9319 case Py_LE:
9320 v = TEST_COND(result <= 0);
9321 break;
9322 case Py_GE:
9323 v = TEST_COND(result >= 0);
9324 break;
9325 case Py_LT:
9326 v = TEST_COND(result == -1);
9327 break;
9328 case Py_GT:
9329 v = TEST_COND(result == 1);
9330 break;
9331 default:
9332 PyErr_BadArgument();
9333 return NULL;
9334 }
9335 Py_INCREF(v);
9336 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009337 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009338
Brian Curtindfc80e32011-08-10 20:28:54 -05009339 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009340}
9341
Alexander Belopolsky40018472011-02-26 01:02:56 +00009342int
9343PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009344{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009345 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 int kind1, kind2, kind;
9347 void *buf1, *buf2;
9348 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009349 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009350
9351 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009352 sub = PyUnicode_FromObject(element);
9353 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009354 PyErr_Format(PyExc_TypeError,
9355 "'in <string>' requires string as left operand, not %s",
9356 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009357 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009358 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 if (PyUnicode_READY(sub) == -1)
9360 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009361
Thomas Wouters477c8d52006-05-27 19:21:47 +00009362 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009364 Py_DECREF(sub);
9365 return -1;
9366 }
9367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009368 kind1 = PyUnicode_KIND(str);
9369 kind2 = PyUnicode_KIND(sub);
9370 kind = kind1 > kind2 ? kind1 : kind2;
9371 buf1 = PyUnicode_DATA(str);
9372 buf2 = PyUnicode_DATA(sub);
9373 if (kind1 != kind)
9374 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9375 if (!buf1) {
9376 Py_DECREF(sub);
9377 return -1;
9378 }
9379 if (kind2 != kind)
9380 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9381 if (!buf2) {
9382 Py_DECREF(sub);
9383 if (kind1 != kind) PyMem_Free(buf1);
9384 return -1;
9385 }
9386 len1 = PyUnicode_GET_LENGTH(str);
9387 len2 = PyUnicode_GET_LENGTH(sub);
9388
9389 switch(kind) {
9390 case PyUnicode_1BYTE_KIND:
9391 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9392 break;
9393 case PyUnicode_2BYTE_KIND:
9394 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9395 break;
9396 case PyUnicode_4BYTE_KIND:
9397 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9398 break;
9399 default:
9400 result = -1;
9401 assert(0);
9402 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009403
9404 Py_DECREF(str);
9405 Py_DECREF(sub);
9406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009407 if (kind1 != kind)
9408 PyMem_Free(buf1);
9409 if (kind2 != kind)
9410 PyMem_Free(buf2);
9411
Guido van Rossum403d68b2000-03-13 15:55:09 +00009412 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009413}
9414
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415/* Concat to string or Unicode object giving a new Unicode object. */
9416
Alexander Belopolsky40018472011-02-26 01:02:56 +00009417PyObject *
9418PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 PyObject *u = NULL, *v = NULL, *w;
9421 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422
9423 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009426 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009430
9431 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009433 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009437 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439 }
9440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9442 goto onError;
9443
9444 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
9445 if (PyUnicode_MAX_CHAR_VALUE(v) > maxchar)
9446 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
9447
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 w = PyUnicode_New(
9450 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9451 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009453 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
9455 PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u), v, 0,
9456 PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457 Py_DECREF(u);
9458 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009460
Benjamin Peterson29060642009-01-31 22:14:21 +00009461 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462 Py_XDECREF(u);
9463 Py_XDECREF(v);
9464 return NULL;
9465}
9466
Walter Dörwald1ab83302007-05-18 17:15:44 +00009467void
9468PyUnicode_Append(PyObject **pleft, PyObject *right)
9469{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009470 PyObject *new;
9471 if (*pleft == NULL)
9472 return;
9473 if (right == NULL || !PyUnicode_Check(*pleft)) {
9474 Py_DECREF(*pleft);
9475 *pleft = NULL;
9476 return;
9477 }
9478 new = PyUnicode_Concat(*pleft, right);
9479 Py_DECREF(*pleft);
9480 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009481}
9482
9483void
9484PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9485{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009486 PyUnicode_Append(pleft, right);
9487 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009488}
9489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009490PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009491 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009493Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009494string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009495interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496
9497static PyObject *
9498unicode_count(PyUnicodeObject *self, PyObject *args)
9499{
9500 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009501 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009502 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 int kind1, kind2, kind;
9505 void *buf1, *buf2;
9506 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507
Jesus Ceaac451502011-04-20 17:09:23 +02009508 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9509 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 kind1 = PyUnicode_KIND(self);
9513 kind2 = PyUnicode_KIND(substring);
9514 kind = kind1 > kind2 ? kind1 : kind2;
9515 buf1 = PyUnicode_DATA(self);
9516 buf2 = PyUnicode_DATA(substring);
9517 if (kind1 != kind)
9518 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9519 if (!buf1) {
9520 Py_DECREF(substring);
9521 return NULL;
9522 }
9523 if (kind2 != kind)
9524 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9525 if (!buf2) {
9526 Py_DECREF(substring);
9527 if (kind1 != kind) PyMem_Free(buf1);
9528 return NULL;
9529 }
9530 len1 = PyUnicode_GET_LENGTH(self);
9531 len2 = PyUnicode_GET_LENGTH(substring);
9532
9533 ADJUST_INDICES(start, end, len1);
9534 switch(kind) {
9535 case PyUnicode_1BYTE_KIND:
9536 iresult = ucs1lib_count(
9537 ((Py_UCS1*)buf1) + start, end - start,
9538 buf2, len2, PY_SSIZE_T_MAX
9539 );
9540 break;
9541 case PyUnicode_2BYTE_KIND:
9542 iresult = ucs2lib_count(
9543 ((Py_UCS2*)buf1) + start, end - start,
9544 buf2, len2, PY_SSIZE_T_MAX
9545 );
9546 break;
9547 case PyUnicode_4BYTE_KIND:
9548 iresult = ucs4lib_count(
9549 ((Py_UCS4*)buf1) + start, end - start,
9550 buf2, len2, PY_SSIZE_T_MAX
9551 );
9552 break;
9553 default:
9554 assert(0); iresult = 0;
9555 }
9556
9557 result = PyLong_FromSsize_t(iresult);
9558
9559 if (kind1 != kind)
9560 PyMem_Free(buf1);
9561 if (kind2 != kind)
9562 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563
9564 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009565
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566 return result;
9567}
9568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009569PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009570 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009572Encode S using the codec registered for encoding. Default encoding\n\
9573is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009574handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009575a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9576'xmlcharrefreplace' as well as any other name registered with\n\
9577codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578
9579static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009580unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009582 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583 char *encoding = NULL;
9584 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009585
Benjamin Peterson308d6372009-09-18 21:42:35 +00009586 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9587 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009588 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009589 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009590}
9591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009592PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009593 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594\n\
9595Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009596If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597
9598static PyObject*
9599unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9600{
9601 Py_UNICODE *e;
9602 Py_UNICODE *p;
9603 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009604 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606 PyUnicodeObject *u;
9607 int tabsize = 8;
9608
9609 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9613 return NULL;
9614
Thomas Wouters7e474022000-07-16 12:04:32 +00009615 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009616 i = 0; /* chars up to and including most recent \n or \r */
9617 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9619 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009621 if (tabsize > 0) {
9622 incr = tabsize - (j % tabsize); /* cannot overflow */
9623 if (j > PY_SSIZE_T_MAX - incr)
9624 goto overflow1;
9625 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009626 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009629 if (j > PY_SSIZE_T_MAX - 1)
9630 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631 j++;
9632 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009633 if (i > PY_SSIZE_T_MAX - j)
9634 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009636 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637 }
9638 }
9639
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009640 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009641 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009642
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643 /* Second pass: create output string and fill it */
9644 u = _PyUnicode_New(i + j);
9645 if (!u)
9646 return NULL;
9647
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009648 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 q = _PyUnicode_WSTR(u); /* next output char */
9650 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009654 if (tabsize > 0) {
9655 i = tabsize - (j % tabsize);
9656 j += i;
9657 while (i--) {
9658 if (q >= qe)
9659 goto overflow2;
9660 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009661 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009662 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009663 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009664 else {
9665 if (q >= qe)
9666 goto overflow2;
9667 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009668 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669 if (*p == '\n' || *p == '\r')
9670 j = 0;
9671 }
9672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 if (PyUnicode_READY(u) == -1) {
9674 Py_DECREF(u);
9675 return NULL;
9676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009678
9679 overflow2:
9680 Py_DECREF(u);
9681 overflow1:
9682 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9683 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684}
9685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009686PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009687 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688\n\
9689Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009690such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691arguments start and end are interpreted as in slice notation.\n\
9692\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009693Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694
9695static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697{
Jesus Ceaac451502011-04-20 17:09:23 +02009698 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009699 Py_ssize_t start;
9700 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009701 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702
Jesus Ceaac451502011-04-20 17:09:23 +02009703 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9704 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 if (PyUnicode_READY(self) == -1)
9708 return NULL;
9709 if (PyUnicode_READY(substring) == -1)
9710 return NULL;
9711
9712 result = any_find_slice(
9713 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9714 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009715 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716
9717 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009719 if (result == -2)
9720 return NULL;
9721
Christian Heimes217cfd12007-12-02 14:31:20 +00009722 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009723}
9724
9725static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009726unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 Py_UCS4 ch;
9729
9730 if (PyUnicode_READY(self) == -1)
9731 return NULL;
9732 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009733 PyErr_SetString(PyExc_IndexError, "string index out of range");
9734 return NULL;
9735 }
9736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9738 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009739}
9740
Guido van Rossumc2504932007-09-18 19:42:40 +00009741/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009742 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009743static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009744unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745{
Guido van Rossumc2504932007-09-18 19:42:40 +00009746 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009747 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009749 if (_PyUnicode_HASH(self) != -1)
9750 return _PyUnicode_HASH(self);
9751 if (PyUnicode_READY(self) == -1)
9752 return -1;
9753 len = PyUnicode_GET_LENGTH(self);
9754
9755 /* The hash function as a macro, gets expanded three times below. */
9756#define HASH(P) \
9757 x = (Py_uhash_t)*P << 7; \
9758 while (--len >= 0) \
9759 x = (1000003*x) ^ (Py_uhash_t)*P++;
9760
9761 switch (PyUnicode_KIND(self)) {
9762 case PyUnicode_1BYTE_KIND: {
9763 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9764 HASH(c);
9765 break;
9766 }
9767 case PyUnicode_2BYTE_KIND: {
9768 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9769 HASH(s);
9770 break;
9771 }
9772 default: {
9773 Py_UCS4 *l;
9774 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9775 "Impossible switch case in unicode_hash");
9776 l = PyUnicode_4BYTE_DATA(self);
9777 HASH(l);
9778 break;
9779 }
9780 }
9781 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9782
Guido van Rossumc2504932007-09-18 19:42:40 +00009783 if (x == -1)
9784 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009786 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009788#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009790PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009791 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009792\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009793Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794
9795static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009798 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009799 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009800 Py_ssize_t start;
9801 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802
Jesus Ceaac451502011-04-20 17:09:23 +02009803 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9804 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 if (PyUnicode_READY(self) == -1)
9808 return NULL;
9809 if (PyUnicode_READY(substring) == -1)
9810 return NULL;
9811
9812 result = any_find_slice(
9813 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9814 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009815 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816
9817 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 if (result == -2)
9820 return NULL;
9821
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822 if (result < 0) {
9823 PyErr_SetString(PyExc_ValueError, "substring not found");
9824 return NULL;
9825 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009826
Christian Heimes217cfd12007-12-02 14:31:20 +00009827 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828}
9829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009830PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009831 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009833Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009834at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835
9836static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009837unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009838{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 Py_ssize_t i, length;
9840 int kind;
9841 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009842 int cased;
9843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 if (PyUnicode_READY(self) == -1)
9845 return NULL;
9846 length = PyUnicode_GET_LENGTH(self);
9847 kind = PyUnicode_KIND(self);
9848 data = PyUnicode_DATA(self);
9849
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 if (length == 1)
9852 return PyBool_FromLong(
9853 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009855 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009857 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009858
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009860 for (i = 0; i < length; i++) {
9861 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009862
Benjamin Peterson29060642009-01-31 22:14:21 +00009863 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9864 return PyBool_FromLong(0);
9865 else if (!cased && Py_UNICODE_ISLOWER(ch))
9866 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009868 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869}
9870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009871PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009872 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009874Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009875at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009876
9877static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009878unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 Py_ssize_t i, length;
9881 int kind;
9882 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883 int cased;
9884
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885 if (PyUnicode_READY(self) == -1)
9886 return NULL;
9887 length = PyUnicode_GET_LENGTH(self);
9888 kind = PyUnicode_KIND(self);
9889 data = PyUnicode_DATA(self);
9890
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 if (length == 1)
9893 return PyBool_FromLong(
9894 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009896 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009898 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009899
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 for (i = 0; i < length; i++) {
9902 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009903
Benjamin Peterson29060642009-01-31 22:14:21 +00009904 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9905 return PyBool_FromLong(0);
9906 else if (!cased && Py_UNICODE_ISUPPER(ch))
9907 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009908 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009909 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910}
9911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009912PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009913 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009915Return True if S is a titlecased string and there is at least one\n\
9916character in S, i.e. upper- and titlecase characters may only\n\
9917follow uncased characters and lowercase characters only cased ones.\n\
9918Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919
9920static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009921unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009922{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 Py_ssize_t i, length;
9924 int kind;
9925 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926 int cased, previous_is_cased;
9927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 if (PyUnicode_READY(self) == -1)
9929 return NULL;
9930 length = PyUnicode_GET_LENGTH(self);
9931 kind = PyUnicode_KIND(self);
9932 data = PyUnicode_DATA(self);
9933
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 if (length == 1) {
9936 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
9937 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
9938 (Py_UNICODE_ISUPPER(ch) != 0));
9939 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009941 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009943 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009944
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945 cased = 0;
9946 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 for (i = 0; i < length; i++) {
9948 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009949
Benjamin Peterson29060642009-01-31 22:14:21 +00009950 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
9951 if (previous_is_cased)
9952 return PyBool_FromLong(0);
9953 previous_is_cased = 1;
9954 cased = 1;
9955 }
9956 else if (Py_UNICODE_ISLOWER(ch)) {
9957 if (!previous_is_cased)
9958 return PyBool_FromLong(0);
9959 previous_is_cased = 1;
9960 cased = 1;
9961 }
9962 else
9963 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009964 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009965 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966}
9967
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009968PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009969 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009971Return True if all characters in S are whitespace\n\
9972and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973
9974static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009975unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 Py_ssize_t i, length;
9978 int kind;
9979 void *data;
9980
9981 if (PyUnicode_READY(self) == -1)
9982 return NULL;
9983 length = PyUnicode_GET_LENGTH(self);
9984 kind = PyUnicode_KIND(self);
9985 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 if (length == 1)
9989 return PyBool_FromLong(
9990 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009992 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009994 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 for (i = 0; i < length; i++) {
9997 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03009998 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00009999 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010000 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010001 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002}
10003
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010004PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010005 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010006\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010007Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010008and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010009
10010static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010011unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 Py_ssize_t i, length;
10014 int kind;
10015 void *data;
10016
10017 if (PyUnicode_READY(self) == -1)
10018 return NULL;
10019 length = PyUnicode_GET_LENGTH(self);
10020 kind = PyUnicode_KIND(self);
10021 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010022
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010023 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 if (length == 1)
10025 return PyBool_FromLong(
10026 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010027
10028 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010030 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 for (i = 0; i < length; i++) {
10033 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010034 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010035 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010036 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010037}
10038
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010039PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010040 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010041\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010042Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010043and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010044
10045static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010046unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010047{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 int kind;
10049 void *data;
10050 Py_ssize_t len, i;
10051
10052 if (PyUnicode_READY(self) == -1)
10053 return NULL;
10054
10055 kind = PyUnicode_KIND(self);
10056 data = PyUnicode_DATA(self);
10057 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010058
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010059 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 if (len == 1) {
10061 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10062 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10063 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010064
10065 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010067 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 for (i = 0; i < len; i++) {
10070 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010071 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010072 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010073 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010074 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010075}
10076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010077PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010078 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010080Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010081False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082
10083static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010084unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010085{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 Py_ssize_t i, length;
10087 int kind;
10088 void *data;
10089
10090 if (PyUnicode_READY(self) == -1)
10091 return NULL;
10092 length = PyUnicode_GET_LENGTH(self);
10093 kind = PyUnicode_KIND(self);
10094 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095
Guido van Rossumd57fd912000-03-10 22:53:23 +000010096 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 if (length == 1)
10098 return PyBool_FromLong(
10099 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010101 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010103 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 for (i = 0; i < length; i++) {
10106 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010107 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010109 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110}
10111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010112PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010113 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010114\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010115Return True if all characters in S are digits\n\
10116and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117
10118static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010119unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010120{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 Py_ssize_t i, length;
10122 int kind;
10123 void *data;
10124
10125 if (PyUnicode_READY(self) == -1)
10126 return NULL;
10127 length = PyUnicode_GET_LENGTH(self);
10128 kind = PyUnicode_KIND(self);
10129 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010130
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 if (length == 1) {
10133 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10134 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010137 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010139 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 for (i = 0; i < length; i++) {
10142 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010143 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010145 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146}
10147
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010148PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010149 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010151Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010152False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153
10154static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010155unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 Py_ssize_t i, length;
10158 int kind;
10159 void *data;
10160
10161 if (PyUnicode_READY(self) == -1)
10162 return NULL;
10163 length = PyUnicode_GET_LENGTH(self);
10164 kind = PyUnicode_KIND(self);
10165 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 if (length == 1)
10169 return PyBool_FromLong(
10170 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010172 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010174 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 for (i = 0; i < length; i++) {
10177 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010178 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010180 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181}
10182
Martin v. Löwis47383402007-08-15 07:32:56 +000010183int
10184PyUnicode_IsIdentifier(PyObject *self)
10185{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 int kind;
10187 void *data;
10188 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010189 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 if (PyUnicode_READY(self) == -1) {
10192 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010193 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 }
10195
10196 /* Special case for empty strings */
10197 if (PyUnicode_GET_LENGTH(self) == 0)
10198 return 0;
10199 kind = PyUnicode_KIND(self);
10200 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010201
10202 /* PEP 3131 says that the first character must be in
10203 XID_Start and subsequent characters in XID_Continue,
10204 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010205 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010206 letters, digits, underscore). However, given the current
10207 definition of XID_Start and XID_Continue, it is sufficient
10208 to check just for these, except that _ must be allowed
10209 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010211 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010212 return 0;
10213
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010214 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010216 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010217 return 1;
10218}
10219
10220PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010221 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010222\n\
10223Return True if S is a valid identifier according\n\
10224to the language definition.");
10225
10226static PyObject*
10227unicode_isidentifier(PyObject *self)
10228{
10229 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10230}
10231
Georg Brandl559e5d72008-06-11 18:37:52 +000010232PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010233 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010234\n\
10235Return True if all characters in S are considered\n\
10236printable in repr() or S is empty, False otherwise.");
10237
10238static PyObject*
10239unicode_isprintable(PyObject *self)
10240{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 Py_ssize_t i, length;
10242 int kind;
10243 void *data;
10244
10245 if (PyUnicode_READY(self) == -1)
10246 return NULL;
10247 length = PyUnicode_GET_LENGTH(self);
10248 kind = PyUnicode_KIND(self);
10249 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010250
10251 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 if (length == 1)
10253 return PyBool_FromLong(
10254 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 for (i = 0; i < length; i++) {
10257 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010258 Py_RETURN_FALSE;
10259 }
10260 }
10261 Py_RETURN_TRUE;
10262}
10263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010264PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010265 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266\n\
10267Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010268iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269
10270static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010271unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010273 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274}
10275
Martin v. Löwis18e16552006-02-15 17:27:45 +000010276static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277unicode_length(PyUnicodeObject *self)
10278{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 if (PyUnicode_READY(self) == -1)
10280 return -1;
10281 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010282}
10283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010284PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010285 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010287Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010288done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010289
10290static PyObject *
10291unicode_ljust(PyUnicodeObject *self, PyObject *args)
10292{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010293 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 Py_UCS4 fillchar = ' ';
10295
10296 if (PyUnicode_READY(self) == -1)
10297 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010298
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010299 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300 return NULL;
10301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303 Py_INCREF(self);
10304 return (PyObject*) self;
10305 }
10306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308}
10309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010310PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010311 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010313Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314
10315static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010316unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318 return fixup(self, fixlower);
10319}
10320
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
10329
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010330/* externally visible for str.strip(unicode) */
10331PyObject *
10332_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10333{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 void *data;
10335 int kind;
10336 Py_ssize_t i, j, len;
10337 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10340 return NULL;
10341
10342 kind = PyUnicode_KIND(self);
10343 data = PyUnicode_DATA(self);
10344 len = PyUnicode_GET_LENGTH(self);
10345 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10346 PyUnicode_DATA(sepobj),
10347 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010348
Benjamin Peterson14339b62009-01-31 16:36:08 +000010349 i = 0;
10350 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 while (i < len &&
10352 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010353 i++;
10354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010355 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010356
Benjamin Peterson14339b62009-01-31 16:36:08 +000010357 j = len;
10358 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010359 do {
10360 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 } while (j >= i &&
10362 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010363 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010364 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010365
Benjamin Peterson14339b62009-01-31 16:36:08 +000010366 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010367 Py_INCREF(self);
10368 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010369 }
10370 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010372}
10373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374/* Assumes an already ready self string. */
10375
10376static PyObject *
10377substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10378{
10379 const int kind = PyUnicode_KIND(self);
10380 void *data = PyUnicode_DATA(self);
10381 Py_UCS4 maxchar = 0;
10382 Py_ssize_t i;
10383 PyObject *unicode;
10384
10385 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10386 PyErr_BadInternalCall();
10387 return NULL;
10388 }
10389
10390 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10391 Py_INCREF(self);
10392 return (PyObject*)self;
10393 }
10394
10395 for (i = 0; i < len; ++i) {
10396 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10397 if (ch > maxchar)
10398 maxchar = ch;
10399 }
10400
10401 unicode = PyUnicode_New(len, maxchar);
10402 if (unicode == NULL)
10403 return NULL;
10404 PyUnicode_CopyCharacters(unicode, 0,
10405 (PyObject*)self, start, len);
10406 return unicode;
10407}
10408
10409PyObject*
10410PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10411{
10412 unsigned char *data;
10413 int kind;
10414
10415 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10416 && PyUnicode_CheckExact(self))
10417 {
10418 Py_INCREF(self);
10419 return (PyObject *)self;
10420 }
10421
10422 if ((end - start) == 1)
10423 return unicode_getitem((PyUnicodeObject*)self, start);
10424
10425 if (PyUnicode_READY(self) == -1)
10426 return NULL;
10427 kind = PyUnicode_KIND(self);
10428 data = PyUnicode_1BYTE_DATA(self);
10429 return PyUnicode_FromKindAndData(kind, data + PyUnicode_KIND_SIZE(kind, start),
10430 end-start);
10431}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432
10433static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010434do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 int kind;
10437 void *data;
10438 Py_ssize_t len, i, j;
10439
10440 if (PyUnicode_READY(self) == -1)
10441 return NULL;
10442
10443 kind = PyUnicode_KIND(self);
10444 data = PyUnicode_DATA(self);
10445 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010446
Benjamin Peterson14339b62009-01-31 16:36:08 +000010447 i = 0;
10448 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010450 i++;
10451 }
10452 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010453
Benjamin Peterson14339b62009-01-31 16:36:08 +000010454 j = len;
10455 if (striptype != LEFTSTRIP) {
10456 do {
10457 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010459 j++;
10460 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010461
Benjamin Peterson14339b62009-01-31 16:36:08 +000010462 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10463 Py_INCREF(self);
10464 return (PyObject*)self;
10465 }
10466 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010468}
10469
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010470
10471static PyObject *
10472do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10473{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010474 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010475
Benjamin Peterson14339b62009-01-31 16:36:08 +000010476 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10477 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010478
Benjamin Peterson14339b62009-01-31 16:36:08 +000010479 if (sep != NULL && sep != Py_None) {
10480 if (PyUnicode_Check(sep))
10481 return _PyUnicode_XStrip(self, striptype, sep);
10482 else {
10483 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010484 "%s arg must be None or str",
10485 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010486 return NULL;
10487 }
10488 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010489
Benjamin Peterson14339b62009-01-31 16:36:08 +000010490 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010491}
10492
10493
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010494PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010495 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010496\n\
10497Return a copy of the string S with leading and trailing\n\
10498whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010499If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010500
10501static PyObject *
10502unicode_strip(PyUnicodeObject *self, PyObject *args)
10503{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010504 if (PyTuple_GET_SIZE(args) == 0)
10505 return do_strip(self, BOTHSTRIP); /* Common case */
10506 else
10507 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010508}
10509
10510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010511PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010512 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010513\n\
10514Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010515If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010516
10517static PyObject *
10518unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10519{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010520 if (PyTuple_GET_SIZE(args) == 0)
10521 return do_strip(self, LEFTSTRIP); /* Common case */
10522 else
10523 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010524}
10525
10526
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010527PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010528 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010529\n\
10530Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010531If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010532
10533static PyObject *
10534unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10535{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010536 if (PyTuple_GET_SIZE(args) == 0)
10537 return do_strip(self, RIGHTSTRIP); /* Common case */
10538 else
10539 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010540}
10541
10542
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010544unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545{
10546 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 Py_ssize_t nchars, n;
10548 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549
Georg Brandl222de0f2009-04-12 12:01:50 +000010550 if (len < 1) {
10551 Py_INCREF(unicode_empty);
10552 return (PyObject *)unicode_empty;
10553 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554
Tim Peters7a29bd52001-09-12 03:03:31 +000010555 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010556 /* no repeat, return original string */
10557 Py_INCREF(str);
10558 return (PyObject*) str;
10559 }
Tim Peters8f422462000-09-09 06:13:41 +000010560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 if (PyUnicode_READY(str) == -1)
10562 return NULL;
10563
Tim Peters8f422462000-09-09 06:13:41 +000010564 /* ensure # of chars needed doesn't overflow int and # of bytes
10565 * needed doesn't overflow size_t
10566 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 nchars = len * PyUnicode_GET_LENGTH(str);
10568 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010569 PyErr_SetString(PyExc_OverflowError,
10570 "repeated string is too long");
10571 return NULL;
10572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 char_size = PyUnicode_CHARACTER_SIZE(str);
10574 nbytes = (nchars + 1) * char_size;
10575 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010576 PyErr_SetString(PyExc_OverflowError,
10577 "repeated string is too long");
10578 return NULL;
10579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581 if (!u)
10582 return NULL;
10583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 if (PyUnicode_GET_LENGTH(str) == 1) {
10585 const int kind = PyUnicode_KIND(str);
10586 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10587 void *to = PyUnicode_DATA(u);
10588 for (n = 0; n < len; ++n)
10589 PyUnicode_WRITE(kind, to, n, fill_char);
10590 }
10591 else {
10592 /* number of characters copied this far */
10593 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10594 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10595 char *to = (char *) PyUnicode_DATA(u);
10596 Py_MEMCPY(to, PyUnicode_DATA(str),
10597 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010598 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 n = (done <= nchars-done) ? done : nchars-done;
10600 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010601 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603 }
10604
10605 return (PyObject*) u;
10606}
10607
Alexander Belopolsky40018472011-02-26 01:02:56 +000010608PyObject *
10609PyUnicode_Replace(PyObject *obj,
10610 PyObject *subobj,
10611 PyObject *replobj,
10612 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613{
10614 PyObject *self;
10615 PyObject *str1;
10616 PyObject *str2;
10617 PyObject *result;
10618
10619 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010624 Py_DECREF(self);
10625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626 }
10627 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010629 Py_DECREF(self);
10630 Py_DECREF(str1);
10631 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010634 Py_DECREF(self);
10635 Py_DECREF(str1);
10636 Py_DECREF(str2);
10637 return result;
10638}
10639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010640PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010641 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642\n\
10643Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010644old replaced by new. If the optional argument count is\n\
10645given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646
10647static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 PyObject *str1;
10651 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010652 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010653 PyObject *result;
10654
Martin v. Löwis18e16552006-02-15 17:27:45 +000010655 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010656 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010658 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 str1 = PyUnicode_FromObject(str1);
10660 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10661 return NULL;
10662 str2 = PyUnicode_FromObject(str2);
10663 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010664 Py_DECREF(str1);
10665 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010667
10668 result = replace(self, str1, str2, maxcount);
10669
10670 Py_DECREF(str1);
10671 Py_DECREF(str2);
10672 return result;
10673}
10674
/* repr() implementation for str.

   Works in two passes over the input:
     1. a sizing pass that computes the exact output length (`osize`), counts
        single and double quotes, and tracks the largest character that will
        be copied unescaped (`max`);
     2. a writing pass that emits the quoted, escaped text into a string
        allocated with exactly that size.
   The per-character size added in pass 1 must match the number of
   characters written in pass 2 for the same input character.

   Returns a new reference, or NULL if the string cannot be made ready or
   the allocation fails. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, squote, dquote, i, o;
    Py_UCS4 max, quote;
    int ikind, okind;
    void *idata, *odata;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    osize = 2; /* quotes */
    max = 127;
    squote = dquote = 0;
    ikind = PyUnicode_KIND(unicode);
    for (i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        switch (ch) {
        /* Quotes are counted as one character here; the extra backslash for
           escaped single quotes is added after the loop if needed. */
        case '\'': squote++; osize++; break;
        case '"':  dquote++; osize++; break;
        case '\\': case '\t': case '\r': case '\n':
            osize += 2; break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                osize += 4;   /* \xHH */
            else if (ch < 0x7f)
                osize++;
            else if (Py_UNICODE_ISPRINTABLE(ch)) {
                /* Copied as-is, so it can raise the output's max char. */
                osize++;
                max = ch > max ? ch : max;
            }
            else if (ch < 0x100)
                osize += 4;   /* \xHH */
            else if (ch < 0x10000)
                osize += 6;   /* \uHHHH */
            else
                osize += 10;  /* \UHHHHHHHH */
        }
    }

    /* Choose the delimiter: single quote by default, double quote when the
       text contains single quotes but no double quotes. */
    quote = '\'';
    if (squote) {
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            osize += squote;
        else
            quote = '"';
    }

    repr = PyUnicode_New(osize, max);
    if (repr == NULL)
        return NULL;
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    /* Opening and closing quotes are written up front. */
    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);

    /* `o` is the next output index; index 0 already holds the quote. */
    for (i = 0, o = 1; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

        /* Escape quotes and backslashes */
        if ((ch == quote) || (ch == '\\')) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, ch);
            continue;
        }

        /* Map special whitespace to '\t', '\n', '\r' */
        if (ch == '\t') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 't');
        }
        else if (ch == '\n') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'n');
        }
        else if (ch == '\r') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'r');
        }

        /* Map non-printable US ASCII to '\xhh' */
        else if (ch < ' ' || ch == 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'x');
            PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
            PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
        }

        /* Copy ASCII characters as-is */
        else if (ch < 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, ch);
        }

        /* Non-ASCII characters */
        else {
            /* Map Unicode whitespace and control characters
               (categories Z* and C* except ASCII space)
            */
            if (!Py_UNICODE_ISPRINTABLE(ch)) {
                /* Map 8-bit characters to '\xhh' */
                if (ch <= 0xff) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'x');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
                }
                /* Map 21-bit characters to '\U00xxxxxx' */
                else if (ch >= 0x10000) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'U');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
                }
                /* Map 16-bit characters to '\uxxxx' */
                else {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'u');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
                }
            }
            /* Copy characters as-is */
            else {
                PyUnicode_WRITE(okind, odata, o++, ch);
            }
        }
    }
    /* Closing quote already added at the beginning */
    return repr;
}
10824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010825PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010826 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827\n\
10828Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010829such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830arguments start and end are interpreted as in slice notation.\n\
10831\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010832Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833
10834static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836{
Jesus Ceaac451502011-04-20 17:09:23 +020010837 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010838 Py_ssize_t start;
10839 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010840 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
Jesus Ceaac451502011-04-20 17:09:23 +020010842 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10843 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010844 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 if (PyUnicode_READY(self) == -1)
10847 return NULL;
10848 if (PyUnicode_READY(substring) == -1)
10849 return NULL;
10850
10851 result = any_find_slice(
10852 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10853 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010854 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855
10856 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 if (result == -2)
10859 return NULL;
10860
Christian Heimes217cfd12007-12-02 14:31:20 +000010861 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862}
10863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010864PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010865 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010867Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868
10869static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871{
Jesus Ceaac451502011-04-20 17:09:23 +020010872 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010873 Py_ssize_t start;
10874 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010875 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876
Jesus Ceaac451502011-04-20 17:09:23 +020010877 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10878 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 if (PyUnicode_READY(self) == -1)
10882 return NULL;
10883 if (PyUnicode_READY(substring) == -1)
10884 return NULL;
10885
10886 result = any_find_slice(
10887 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10888 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010889 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
10891 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 if (result == -2)
10894 return NULL;
10895
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896 if (result < 0) {
10897 PyErr_SetString(PyExc_ValueError, "substring not found");
10898 return NULL;
10899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900
Christian Heimes217cfd12007-12-02 14:31:20 +000010901 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010902}
10903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010904PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010905 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010907Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010908done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909
10910static PyObject *
10911unicode_rjust(PyUnicodeObject *self, PyObject *args)
10912{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010913 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 Py_UCS4 fillchar = ' ';
10915
10916 if (PyUnicode_READY(self) == -1)
10917 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010918
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010919 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920 return NULL;
10921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923 Py_INCREF(self);
10924 return (PyObject*) self;
10925 }
10926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928}
10929
Alexander Belopolsky40018472011-02-26 01:02:56 +000010930PyObject *
10931PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932{
10933 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010934
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935 s = PyUnicode_FromObject(s);
10936 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010937 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010938 if (sep != NULL) {
10939 sep = PyUnicode_FromObject(sep);
10940 if (sep == NULL) {
10941 Py_DECREF(s);
10942 return NULL;
10943 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944 }
10945
10946 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10947
10948 Py_DECREF(s);
10949 Py_XDECREF(sep);
10950 return result;
10951}
10952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010953PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010954 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955\n\
10956Return a list of the words in S, using sep as the\n\
10957delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000010958splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000010959whitespace string is a separator and empty strings are\n\
10960removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961
10962static PyObject*
10963unicode_split(PyUnicodeObject *self, PyObject *args)
10964{
10965 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010966 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967
Martin v. Löwis18e16552006-02-15 17:27:45 +000010968 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969 return NULL;
10970
10971 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000010972 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000010974 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975 else
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977}
10978
Thomas Wouters477c8d52006-05-27 19:21:47 +000010979PyObject *
10980PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
10981{
10982 PyObject* str_obj;
10983 PyObject* sep_obj;
10984 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985 int kind1, kind2, kind;
10986 void *buf1 = NULL, *buf2 = NULL;
10987 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010988
10989 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010992 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010994 Py_DECREF(str_obj);
10995 return NULL;
10996 }
10997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 kind1 = PyUnicode_KIND(str_in);
10999 kind2 = PyUnicode_KIND(sep_obj);
11000 kind = kind1 > kind2 ? kind1 : kind2;
11001 buf1 = PyUnicode_DATA(str_in);
11002 if (kind1 != kind)
11003 buf1 = _PyUnicode_AsKind(str_in, kind);
11004 if (!buf1)
11005 goto onError;
11006 buf2 = PyUnicode_DATA(sep_obj);
11007 if (kind2 != kind)
11008 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11009 if (!buf2)
11010 goto onError;
11011 len1 = PyUnicode_GET_LENGTH(str_obj);
11012 len2 = PyUnicode_GET_LENGTH(sep_obj);
11013
11014 switch(PyUnicode_KIND(str_in)) {
11015 case PyUnicode_1BYTE_KIND:
11016 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11017 break;
11018 case PyUnicode_2BYTE_KIND:
11019 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11020 break;
11021 case PyUnicode_4BYTE_KIND:
11022 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11023 break;
11024 default:
11025 assert(0);
11026 out = 0;
11027 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011028
11029 Py_DECREF(sep_obj);
11030 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031 if (kind1 != kind)
11032 PyMem_Free(buf1);
11033 if (kind2 != kind)
11034 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011035
11036 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 onError:
11038 Py_DECREF(sep_obj);
11039 Py_DECREF(str_obj);
11040 if (kind1 != kind && buf1)
11041 PyMem_Free(buf1);
11042 if (kind2 != kind && buf2)
11043 PyMem_Free(buf2);
11044 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011045}
11046
11047
11048PyObject *
11049PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11050{
11051 PyObject* str_obj;
11052 PyObject* sep_obj;
11053 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054 int kind1, kind2, kind;
11055 void *buf1 = NULL, *buf2 = NULL;
11056 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011057
11058 str_obj = PyUnicode_FromObject(str_in);
11059 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011060 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011061 sep_obj = PyUnicode_FromObject(sep_in);
11062 if (!sep_obj) {
11063 Py_DECREF(str_obj);
11064 return NULL;
11065 }
11066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 kind1 = PyUnicode_KIND(str_in);
11068 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011069 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070 buf1 = PyUnicode_DATA(str_in);
11071 if (kind1 != kind)
11072 buf1 = _PyUnicode_AsKind(str_in, kind);
11073 if (!buf1)
11074 goto onError;
11075 buf2 = PyUnicode_DATA(sep_obj);
11076 if (kind2 != kind)
11077 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11078 if (!buf2)
11079 goto onError;
11080 len1 = PyUnicode_GET_LENGTH(str_obj);
11081 len2 = PyUnicode_GET_LENGTH(sep_obj);
11082
11083 switch(PyUnicode_KIND(str_in)) {
11084 case PyUnicode_1BYTE_KIND:
11085 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11086 break;
11087 case PyUnicode_2BYTE_KIND:
11088 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11089 break;
11090 case PyUnicode_4BYTE_KIND:
11091 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11092 break;
11093 default:
11094 assert(0);
11095 out = 0;
11096 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011097
11098 Py_DECREF(sep_obj);
11099 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 if (kind1 != kind)
11101 PyMem_Free(buf1);
11102 if (kind2 != kind)
11103 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011104
11105 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011106 onError:
11107 Py_DECREF(sep_obj);
11108 Py_DECREF(str_obj);
11109 if (kind1 != kind && buf1)
11110 PyMem_Free(buf1);
11111 if (kind2 != kind && buf2)
11112 PyMem_Free(buf2);
11113 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011114}
11115
11116PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011118\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011119Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011120the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011121found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011122
11123static PyObject*
11124unicode_partition(PyUnicodeObject *self, PyObject *separator)
11125{
11126 return PyUnicode_Partition((PyObject *)self, separator);
11127}
11128
11129PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011130 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011131\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011132Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011133the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011134separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011135
11136static PyObject*
11137unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11138{
11139 return PyUnicode_RPartition((PyObject *)self, separator);
11140}
11141
Alexander Belopolsky40018472011-02-26 01:02:56 +000011142PyObject *
11143PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011144{
11145 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011146
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011147 s = PyUnicode_FromObject(s);
11148 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011149 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011150 if (sep != NULL) {
11151 sep = PyUnicode_FromObject(sep);
11152 if (sep == NULL) {
11153 Py_DECREF(s);
11154 return NULL;
11155 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011156 }
11157
11158 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11159
11160 Py_DECREF(s);
11161 Py_XDECREF(sep);
11162 return result;
11163}
11164
11165PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011166 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011167\n\
11168Return a list of the words in S, using sep as the\n\
11169delimiter string, starting at the end of the string and\n\
11170working to the front. If maxsplit is given, at most maxsplit\n\
11171splits are done. If sep is not specified, any whitespace string\n\
11172is a separator.");
11173
11174static PyObject*
11175unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11176{
11177 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011178 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011179
Martin v. Löwis18e16552006-02-15 17:27:45 +000011180 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011181 return NULL;
11182
11183 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011184 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011185 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011186 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011187 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011188 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011189}
11190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011191PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011192 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193\n\
11194Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011195Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011196is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197
11198static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011199unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011201 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011202 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011204 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11205 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206 return NULL;
11207
Guido van Rossum86662912000-04-11 15:38:46 +000011208 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209}
11210
11211static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011212PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213{
Walter Dörwald346737f2007-05-31 10:44:43 +000011214 if (PyUnicode_CheckExact(self)) {
11215 Py_INCREF(self);
11216 return self;
11217 } else
11218 /* Subtype -- return genuine unicode string with the same value. */
11219 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
11220 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221}
11222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011223PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011224 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225\n\
11226Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011227and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228
11229static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011230unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232 return fixup(self, fixswapcase);
11233}
11234
Georg Brandlceee0772007-11-27 23:48:05 +000011235PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011237\n\
11238Return a translation table usable for str.translate().\n\
11239If there is only one argument, it must be a dictionary mapping Unicode\n\
11240ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011241Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011242If there are two arguments, they must be strings of equal length, and\n\
11243in the resulting dictionary, each character in x will be mapped to the\n\
11244character at the same position in y. If there is a third argument, it\n\
11245must be a string, whose characters will be mapped to None in the result.");
11246
11247static PyObject*
11248unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11249{
11250 PyObject *x, *y = NULL, *z = NULL;
11251 PyObject *new = NULL, *key, *value;
11252 Py_ssize_t i = 0;
11253 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011254
Georg Brandlceee0772007-11-27 23:48:05 +000011255 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11256 return NULL;
11257 new = PyDict_New();
11258 if (!new)
11259 return NULL;
11260 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 int x_kind, y_kind, z_kind;
11262 void *x_data, *y_data, *z_data;
11263
Georg Brandlceee0772007-11-27 23:48:05 +000011264 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011265 if (!PyUnicode_Check(x)) {
11266 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11267 "be a string if there is a second argument");
11268 goto err;
11269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011271 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11272 "arguments must have equal length");
11273 goto err;
11274 }
11275 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011276 x_kind = PyUnicode_KIND(x);
11277 y_kind = PyUnicode_KIND(y);
11278 x_data = PyUnicode_DATA(x);
11279 y_data = PyUnicode_DATA(y);
11280 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11281 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11282 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011283 if (!key || !value)
11284 goto err;
11285 res = PyDict_SetItem(new, key, value);
11286 Py_DECREF(key);
11287 Py_DECREF(value);
11288 if (res < 0)
11289 goto err;
11290 }
11291 /* create entries for deleting chars in z */
11292 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011293 z_kind = PyUnicode_KIND(z);
11294 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011295 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011297 if (!key)
11298 goto err;
11299 res = PyDict_SetItem(new, key, Py_None);
11300 Py_DECREF(key);
11301 if (res < 0)
11302 goto err;
11303 }
11304 }
11305 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 int kind;
11307 void *data;
11308
Georg Brandlceee0772007-11-27 23:48:05 +000011309 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011310 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011311 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11312 "to maketrans it must be a dict");
11313 goto err;
11314 }
11315 /* copy entries into the new dict, converting string keys to int keys */
11316 while (PyDict_Next(x, &i, &key, &value)) {
11317 if (PyUnicode_Check(key)) {
11318 /* convert string keys to integer keys */
11319 PyObject *newkey;
11320 if (PyUnicode_GET_SIZE(key) != 1) {
11321 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11322 "table must be of length 1");
11323 goto err;
11324 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 kind = PyUnicode_KIND(key);
11326 data = PyUnicode_DATA(key);
11327 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011328 if (!newkey)
11329 goto err;
11330 res = PyDict_SetItem(new, newkey, value);
11331 Py_DECREF(newkey);
11332 if (res < 0)
11333 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011334 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011335 /* just keep integer keys */
11336 if (PyDict_SetItem(new, key, value) < 0)
11337 goto err;
11338 } else {
11339 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11340 "be strings or integers");
11341 goto err;
11342 }
11343 }
11344 }
11345 return new;
11346 err:
11347 Py_DECREF(new);
11348 return NULL;
11349}
11350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011351PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011352 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353\n\
11354Return a copy of the string S, where all characters have been mapped\n\
11355through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011356Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011357Unmapped characters are left untouched. Characters mapped to None\n\
11358are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359
11360static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364}
11365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011366PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011367 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011369Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370
11371static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011372unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374 return fixup(self, fixupper);
11375}
11376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011377PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011378 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011380Pad a numeric string S with zeros on the left, to fill a field\n\
11381of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382
11383static PyObject *
11384unicode_zfill(PyUnicodeObject *self, PyObject *args)
11385{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011386 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011388 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 int kind;
11390 void *data;
11391 Py_UCS4 chr;
11392
11393 if (PyUnicode_READY(self) == -1)
11394 return NULL;
11395
Martin v. Löwis18e16552006-02-15 17:27:45 +000011396 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397 return NULL;
11398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011400 if (PyUnicode_CheckExact(self)) {
11401 Py_INCREF(self);
11402 return (PyObject*) self;
11403 }
11404 else
11405 return PyUnicode_FromUnicode(
11406 PyUnicode_AS_UNICODE(self),
11407 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +000011408 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409 }
11410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412
11413 u = pad(self, fill, 0, '0');
11414
Walter Dörwald068325e2002-04-15 13:36:47 +000011415 if (u == NULL)
11416 return NULL;
11417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418 kind = PyUnicode_KIND(u);
11419 data = PyUnicode_DATA(u);
11420 chr = PyUnicode_READ(kind, data, fill);
11421
11422 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 PyUnicode_WRITE(kind, data, 0, chr);
11425 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426 }
11427
11428 return (PyObject*) u;
11429}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
#if 0
/* Compiled out: wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
11438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011439PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011440 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011442Return True if S starts with the specified prefix, False otherwise.\n\
11443With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011444With optional end, stop comparing S at that position.\n\
11445prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446
11447static PyObject *
11448unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011449 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011451 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011453 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011454 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011455 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456
Jesus Ceaac451502011-04-20 17:09:23 +020011457 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011458 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011459 if (PyTuple_Check(subobj)) {
11460 Py_ssize_t i;
11461 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11462 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011463 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011464 if (substring == NULL)
11465 return NULL;
11466 result = tailmatch(self, substring, start, end, -1);
11467 Py_DECREF(substring);
11468 if (result) {
11469 Py_RETURN_TRUE;
11470 }
11471 }
11472 /* nothing matched */
11473 Py_RETURN_FALSE;
11474 }
11475 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011476 if (substring == NULL) {
11477 if (PyErr_ExceptionMatches(PyExc_TypeError))
11478 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11479 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011481 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011482 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011484 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485}
11486
11487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011488PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011491Return True if S ends with the specified suffix, False otherwise.\n\
11492With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011493With optional end, stop comparing S at that position.\n\
11494suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
11496static PyObject *
11497unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011498 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011500 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011502 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011503 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011504 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505
Jesus Ceaac451502011-04-20 17:09:23 +020011506 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011508 if (PyTuple_Check(subobj)) {
11509 Py_ssize_t i;
11510 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11511 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011513 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011515 result = tailmatch(self, substring, start, end, +1);
11516 Py_DECREF(substring);
11517 if (result) {
11518 Py_RETURN_TRUE;
11519 }
11520 }
11521 Py_RETURN_FALSE;
11522 }
11523 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011524 if (substring == NULL) {
11525 if (PyErr_ExceptionMatches(PyExc_TypeError))
11526 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11527 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011528 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011529 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011530 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011532 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533}
11534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011536
11537PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011538 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011539\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011540Return a formatted version of S, using substitutions from args and kwargs.\n\
11541The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011542
Eric Smith27bbca62010-11-04 17:06:58 +000011543PyDoc_STRVAR(format_map__doc__,
11544 "S.format_map(mapping) -> str\n\
11545\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011546Return a formatted version of S, using substitutions from mapping.\n\
11547The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011548
Eric Smith4a7d76d2008-05-30 18:10:19 +000011549static PyObject *
11550unicode__format__(PyObject* self, PyObject* args)
11551{
11552 PyObject *format_spec;
11553
11554 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11555 return NULL;
11556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11558 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011559}
11560
Eric Smith8c663262007-08-25 02:26:07 +000011561PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011563\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011564Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011565
11566static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011567unicode__sizeof__(PyUnicodeObject *v)
11568{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 Py_ssize_t size;
11570
11571 /* If it's a compact object, account for base structure +
11572 character data. */
11573 if (PyUnicode_IS_COMPACT_ASCII(v))
11574 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11575 else if (PyUnicode_IS_COMPACT(v))
11576 size = sizeof(PyCompactUnicodeObject) +
11577 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11578 else {
11579 /* If it is a two-block object, account for base object, and
11580 for character block if present. */
11581 size = sizeof(PyUnicodeObject);
11582 if (v->data.any)
11583 size += (PyUnicode_GET_LENGTH(v) + 1) *
11584 PyUnicode_CHARACTER_SIZE(v);
11585 }
11586 /* If the wstr pointer is present, account for it unless it is shared
11587 with the data pointer. Since PyUnicode_DATA will crash if the object
11588 is not ready, check whether it's either not ready (in which case the
11589 data is entirely in wstr) or if the data is not shared. */
11590 if (_PyUnicode_WSTR(v) &&
11591 (!PyUnicode_IS_READY(v) ||
11592 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11593 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11594 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11595 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11596
11597 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011598}
11599
11600PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011602
11603static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011604unicode_getnewargs(PyUnicodeObject *v)
11605{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606 PyObject *copy;
11607 unsigned char *data;
11608 int kind;
11609 if (PyUnicode_READY(v) == -1)
11610 return NULL;
11611 kind = PyUnicode_KIND(v);
11612 data = PyUnicode_1BYTE_DATA(v);
11613 copy = PyUnicode_FromKindAndData(kind, data, PyUnicode_GET_LENGTH(v));
11614 if (!copy)
11615 return NULL;
11616 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011617}
11618
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619static PyMethodDef unicode_methods[] = {
11620
11621 /* Order is according to common usage: often used methods should
11622 appear first, since lookup is done sequentially. */
11623
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011624 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011625 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11626 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011627 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011628 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11629 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11630 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11631 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11632 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11633 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11634 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011635 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011636 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11637 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11638 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011639 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011640 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11641 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11642 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011643 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011644 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011645 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011646 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011647 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11648 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11649 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11650 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11651 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11652 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11653 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11654 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11655 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11656 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11657 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11658 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11659 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11660 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011661 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011662 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011663 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011664 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011665 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011666 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011667 {"maketrans", (PyCFunction) unicode_maketrans,
11668 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011669 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011670#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011671 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672#endif
11673
11674#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011675 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011676 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677#endif
11678
Benjamin Peterson14339b62009-01-31 16:36:08 +000011679 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680 {NULL, NULL}
11681};
11682
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011683static PyObject *
11684unicode_mod(PyObject *v, PyObject *w)
11685{
Brian Curtindfc80e32011-08-10 20:28:54 -050011686 if (!PyUnicode_Check(v))
11687 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011688 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011689}
11690
11691static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011692 0, /*nb_add*/
11693 0, /*nb_subtract*/
11694 0, /*nb_multiply*/
11695 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011696};
11697
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011699 (lenfunc) unicode_length, /* sq_length */
11700 PyUnicode_Concat, /* sq_concat */
11701 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11702 (ssizeargfunc) unicode_getitem, /* sq_item */
11703 0, /* sq_slice */
11704 0, /* sq_ass_item */
11705 0, /* sq_ass_slice */
11706 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707};
11708
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011709static PyObject*
11710unicode_subscript(PyUnicodeObject* self, PyObject* item)
11711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 if (PyUnicode_READY(self) == -1)
11713 return NULL;
11714
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011715 if (PyIndex_Check(item)) {
11716 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011717 if (i == -1 && PyErr_Occurred())
11718 return NULL;
11719 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011721 return unicode_getitem(self, i);
11722 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011723 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011724 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011725 Py_UNICODE* result_buf;
11726 PyObject* result;
11727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011729 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011730 return NULL;
11731 }
11732
11733 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 return PyUnicode_New(0, 0);
11735 } else if (start == 0 && step == 1 &&
11736 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011737 PyUnicode_CheckExact(self)) {
11738 Py_INCREF(self);
11739 return (PyObject *)self;
11740 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011742 } else {
11743 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011744 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11745 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011746
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 if (result_buf == NULL)
11748 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011749
11750 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11751 result_buf[i] = source_buf[cur];
11752 }
Tim Petersced69f82003-09-16 20:30:58 +000011753
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011754 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011755 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011756 return result;
11757 }
11758 } else {
11759 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11760 return NULL;
11761 }
11762}
11763
11764static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011765 (lenfunc)unicode_length, /* mp_length */
11766 (binaryfunc)unicode_subscript, /* mp_subscript */
11767 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011768};
11769
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771/* Helpers for PyUnicode_Format() */
11772
11773static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011774getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011776 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011778 (*p_argidx)++;
11779 if (arglen < 0)
11780 return args;
11781 else
11782 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783 }
11784 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011785 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786 return NULL;
11787}
11788
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011789/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011791static PyObject *
11792formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011794 char *p;
11795 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011797
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798 x = PyFloat_AsDouble(v);
11799 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011800 return NULL;
11801
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011803 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011804
Eric Smith0923d1d2009-04-16 20:16:10 +000011805 p = PyOS_double_to_string(x, type, prec,
11806 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011807 if (p == NULL)
11808 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011810 PyMem_Free(p);
11811 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812}
11813
Tim Peters38fd5b62000-09-21 05:43:11 +000011814static PyObject*
11815formatlong(PyObject *val, int flags, int prec, int type)
11816{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011817 char *buf;
11818 int len;
11819 PyObject *str; /* temporary string object. */
11820 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011821
Benjamin Peterson14339b62009-01-31 16:36:08 +000011822 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11823 if (!str)
11824 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011826 Py_DECREF(str);
11827 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011828}
11829
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011832 size_t buflen,
11833 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011835 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011836 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 if (PyUnicode_GET_LENGTH(v) == 1) {
11838 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011839 buf[1] = '\0';
11840 return 1;
11841 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011842 goto onError;
11843 }
11844 else {
11845 /* Integer input truncated to a character */
11846 long x;
11847 x = PyLong_AsLong(v);
11848 if (x == -1 && PyErr_Occurred())
11849 goto onError;
11850
11851 if (x < 0 || x > 0x10ffff) {
11852 PyErr_SetString(PyExc_OverflowError,
11853 "%c arg not in range(0x110000)");
11854 return -1;
11855 }
11856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011858 buf[1] = '\0';
11859 return 1;
11860 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011861
Benjamin Peterson29060642009-01-31 22:14:21 +000011862 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011863 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011865 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866}
11867
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011868/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011869 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011870*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011871#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011872
Alexander Belopolsky40018472011-02-26 01:02:56 +000011873PyObject *
11874PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 void *fmt;
11877 int fmtkind;
11878 PyObject *result;
11879 Py_UCS4 *res, *res0;
11880 Py_UCS4 max;
11881 int kind;
11882 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011886
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011888 PyErr_BadInternalCall();
11889 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11892 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011893 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 fmt = PyUnicode_DATA(uformat);
11895 fmtkind = PyUnicode_KIND(uformat);
11896 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11897 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898
11899 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11901 if (res0 == NULL) {
11902 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011903 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905
11906 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011907 arglen = PyTuple_Size(args);
11908 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909 }
11910 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011911 arglen = -1;
11912 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011914 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011915 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011916 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917
11918 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011920 if (--rescnt < 0) {
11921 rescnt = fmtcnt + 100;
11922 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11924 if (res0 == NULL){
11925 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011926 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 }
11928 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011930 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011932 }
11933 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 /* Got a format specifier */
11935 int flags = 0;
11936 Py_ssize_t width = -1;
11937 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 Py_UCS4 c = '\0';
11939 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 int isnumok;
11941 PyObject *v = NULL;
11942 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 void *pbuf;
11944 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 Py_ssize_t len, len1;
11947 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 fmtpos++;
11950 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
11951 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000011952 Py_ssize_t keylen;
11953 PyObject *key;
11954 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000011955
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 if (dict == NULL) {
11957 PyErr_SetString(PyExc_TypeError,
11958 "format requires a mapping");
11959 goto onError;
11960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 /* Skip over balanced parentheses */
11965 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000011967 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011973 if (fmtcnt < 0 || pcount > 0) {
11974 PyErr_SetString(PyExc_ValueError,
11975 "incomplete format key");
11976 goto onError;
11977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000011979 if (key == NULL)
11980 goto onError;
11981 if (args_owned) {
11982 Py_DECREF(args);
11983 args_owned = 0;
11984 }
11985 args = PyObject_GetItem(dict, key);
11986 Py_DECREF(key);
11987 if (args == NULL) {
11988 goto onError;
11989 }
11990 args_owned = 1;
11991 arglen = -1;
11992 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011993 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 case '-': flags |= F_LJUST; continue;
11997 case '+': flags |= F_SIGN; continue;
11998 case ' ': flags |= F_BLANK; continue;
11999 case '#': flags |= F_ALT; continue;
12000 case '0': flags |= F_ZERO; continue;
12001 }
12002 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012003 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 if (c == '*') {
12005 v = getnextarg(args, arglen, &argidx);
12006 if (v == NULL)
12007 goto onError;
12008 if (!PyLong_Check(v)) {
12009 PyErr_SetString(PyExc_TypeError,
12010 "* wants int");
12011 goto onError;
12012 }
12013 width = PyLong_AsLong(v);
12014 if (width == -1 && PyErr_Occurred())
12015 goto onError;
12016 if (width < 0) {
12017 flags |= F_LJUST;
12018 width = -width;
12019 }
12020 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 }
12023 else if (c >= '0' && c <= '9') {
12024 width = c - '0';
12025 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012027 if (c < '0' || c > '9')
12028 break;
12029 if ((width*10) / 10 != width) {
12030 PyErr_SetString(PyExc_ValueError,
12031 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012032 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012033 }
12034 width = width*10 + (c - '0');
12035 }
12036 }
12037 if (c == '.') {
12038 prec = 0;
12039 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012041 if (c == '*') {
12042 v = getnextarg(args, arglen, &argidx);
12043 if (v == NULL)
12044 goto onError;
12045 if (!PyLong_Check(v)) {
12046 PyErr_SetString(PyExc_TypeError,
12047 "* wants int");
12048 goto onError;
12049 }
12050 prec = PyLong_AsLong(v);
12051 if (prec == -1 && PyErr_Occurred())
12052 goto onError;
12053 if (prec < 0)
12054 prec = 0;
12055 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012057 }
12058 else if (c >= '0' && c <= '9') {
12059 prec = c - '0';
12060 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012062 if (c < '0' || c > '9')
12063 break;
12064 if ((prec*10) / 10 != prec) {
12065 PyErr_SetString(PyExc_ValueError,
12066 "prec too big");
12067 goto onError;
12068 }
12069 prec = prec*10 + (c - '0');
12070 }
12071 }
12072 } /* prec */
12073 if (fmtcnt >= 0) {
12074 if (c == 'h' || c == 'l' || c == 'L') {
12075 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012077 }
12078 }
12079 if (fmtcnt < 0) {
12080 PyErr_SetString(PyExc_ValueError,
12081 "incomplete format");
12082 goto onError;
12083 }
12084 if (c != '%') {
12085 v = getnextarg(args, arglen, &argidx);
12086 if (v == NULL)
12087 goto onError;
12088 }
12089 sign = 0;
12090 fill = ' ';
12091 switch (c) {
12092
12093 case '%':
12094 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012096 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 len = 1;
12099 break;
12100
12101 case 's':
12102 case 'r':
12103 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012104 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012105 temp = v;
12106 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012107 }
12108 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 if (c == 's')
12110 temp = PyObject_Str(v);
12111 else if (c == 'r')
12112 temp = PyObject_Repr(v);
12113 else
12114 temp = PyObject_ASCII(v);
12115 if (temp == NULL)
12116 goto onError;
12117 if (PyUnicode_Check(temp))
12118 /* nothing to do */;
12119 else {
12120 Py_DECREF(temp);
12121 PyErr_SetString(PyExc_TypeError,
12122 "%s argument has non-string str()");
12123 goto onError;
12124 }
12125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 if (PyUnicode_READY(temp) == -1) {
12127 Py_CLEAR(temp);
12128 goto onError;
12129 }
12130 pbuf = PyUnicode_DATA(temp);
12131 kind = PyUnicode_KIND(temp);
12132 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012133 if (prec >= 0 && len > prec)
12134 len = prec;
12135 break;
12136
12137 case 'i':
12138 case 'd':
12139 case 'u':
12140 case 'o':
12141 case 'x':
12142 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012143 isnumok = 0;
12144 if (PyNumber_Check(v)) {
12145 PyObject *iobj=NULL;
12146
12147 if (PyLong_Check(v)) {
12148 iobj = v;
12149 Py_INCREF(iobj);
12150 }
12151 else {
12152 iobj = PyNumber_Long(v);
12153 }
12154 if (iobj!=NULL) {
12155 if (PyLong_Check(iobj)) {
12156 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012157 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012158 Py_DECREF(iobj);
12159 if (!temp)
12160 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 if (PyUnicode_READY(temp) == -1) {
12162 Py_CLEAR(temp);
12163 goto onError;
12164 }
12165 pbuf = PyUnicode_DATA(temp);
12166 kind = PyUnicode_KIND(temp);
12167 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012168 sign = 1;
12169 }
12170 else {
12171 Py_DECREF(iobj);
12172 }
12173 }
12174 }
12175 if (!isnumok) {
12176 PyErr_Format(PyExc_TypeError,
12177 "%%%c format: a number is required, "
12178 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12179 goto onError;
12180 }
12181 if (flags & F_ZERO)
12182 fill = '0';
12183 break;
12184
12185 case 'e':
12186 case 'E':
12187 case 'f':
12188 case 'F':
12189 case 'g':
12190 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012191 temp = formatfloat(v, flags, prec, c);
12192 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012193 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 if (PyUnicode_READY(temp) == -1) {
12195 Py_CLEAR(temp);
12196 goto onError;
12197 }
12198 pbuf = PyUnicode_DATA(temp);
12199 kind = PyUnicode_KIND(temp);
12200 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012201 sign = 1;
12202 if (flags & F_ZERO)
12203 fill = '0';
12204 break;
12205
12206 case 'c':
12207 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012209 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
12210 if (len < 0)
12211 goto onError;
12212 break;
12213
12214 default:
12215 PyErr_Format(PyExc_ValueError,
12216 "unsupported format character '%c' (0x%x) "
12217 "at index %zd",
12218 (31<=c && c<=126) ? (char)c : '?',
12219 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012221 goto onError;
12222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 /* pbuf is initialized here. */
12224 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012225 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12227 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12228 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012229 len--;
12230 }
12231 else if (flags & F_SIGN)
12232 sign = '+';
12233 else if (flags & F_BLANK)
12234 sign = ' ';
12235 else
12236 sign = 0;
12237 }
12238 if (width < len)
12239 width = len;
12240 if (rescnt - (sign != 0) < width) {
12241 reslen -= rescnt;
12242 rescnt = width + fmtcnt + 100;
12243 reslen += rescnt;
12244 if (reslen < 0) {
12245 Py_XDECREF(temp);
12246 PyErr_NoMemory();
12247 goto onError;
12248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12250 if (res0 == 0) {
12251 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 Py_XDECREF(temp);
12253 goto onError;
12254 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012256 }
12257 if (sign) {
12258 if (fill != ' ')
12259 *res++ = sign;
12260 rescnt--;
12261 if (width > len)
12262 width--;
12263 }
12264 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12266 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012267 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12269 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012270 }
12271 rescnt -= 2;
12272 width -= 2;
12273 if (width < 0)
12274 width = 0;
12275 len -= 2;
12276 }
12277 if (width > len && !(flags & F_LJUST)) {
12278 do {
12279 --rescnt;
12280 *res++ = fill;
12281 } while (--width > len);
12282 }
12283 if (fill == ' ') {
12284 if (sign)
12285 *res++ = sign;
12286 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012287 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12288 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12289 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12290 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012291 }
12292 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 /* Copy all characters, preserving len */
12294 len1 = len;
12295 while (len1--) {
12296 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12297 rescnt--;
12298 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012299 while (--width >= len) {
12300 --rescnt;
12301 *res++ = ' ';
12302 }
12303 if (dict && (argidx < arglen) && c != '%') {
12304 PyErr_SetString(PyExc_TypeError,
12305 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012306 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 goto onError;
12308 }
12309 Py_XDECREF(temp);
12310 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311 } /* until end */
12312 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012313 PyErr_SetString(PyExc_TypeError,
12314 "not all arguments converted during string formatting");
12315 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316 }
12317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318
12319 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12320 if (*res > max)
12321 max = *res;
12322 result = PyUnicode_New(reslen - rescnt, max);
12323 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012324 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 kind = PyUnicode_KIND(result);
12326 for (res = res0; res < res0+reslen-rescnt; res++)
12327 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12328 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331 }
12332 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333 return (PyObject *)result;
12334
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012337 Py_DECREF(uformat);
12338 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012339 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340 }
12341 return NULL;
12342}
12343
Jeremy Hylton938ace62002-07-17 16:30:39 +000012344static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012345unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12346
Tim Peters6d6c1a32001-08-02 04:15:00 +000012347static PyObject *
12348unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12349{
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012351 static char *kwlist[] = {"object", "encoding", "errors", 0};
12352 char *encoding = NULL;
12353 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012354
Benjamin Peterson14339b62009-01-31 16:36:08 +000012355 if (type != &PyUnicode_Type)
12356 return unicode_subtype_new(type, args, kwds);
12357 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012358 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012359 return NULL;
12360 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012362 if (encoding == NULL && errors == NULL)
12363 return PyObject_Str(x);
12364 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012365 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012366}
12367
Guido van Rossume023fe02001-08-30 03:12:59 +000012368static PyObject *
12369unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12370{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012371 PyUnicodeObject *tmp, *pnew;
12372 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012374
Benjamin Peterson14339b62009-01-31 16:36:08 +000012375 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12376 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12377 if (tmp == NULL)
12378 return NULL;
12379 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12381 // it seems kind of strange that tp_alloc gets passed the size
12382 // of the unicode string because there will follow another
12383 // malloc.
12384 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12385 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012386 if (pnew == NULL) {
12387 Py_DECREF(tmp);
12388 return NULL;
12389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12391 if (_PyUnicode_WSTR(pnew) == NULL) {
12392 err = PyErr_NoMemory();
12393 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012395 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12396 _PyUnicode_WSTR_LENGTH(pnew) = n;
12397 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12398 _PyUnicode_STATE(pnew).interned = 0;
12399 _PyUnicode_STATE(pnew).kind = 0;
12400 _PyUnicode_STATE(pnew).compact = 0;
12401 _PyUnicode_STATE(pnew).ready = 0;
12402 _PyUnicode_STATE(pnew).ascii = 0;
12403 pnew->data.any = NULL;
12404 _PyUnicode_LENGTH(pnew) = 0;
12405 pnew->_base.utf8 = NULL;
12406 pnew->_base.utf8_length = 0;
12407
12408 if (PyUnicode_READY(pnew) == -1) {
12409 PyObject_FREE(_PyUnicode_WSTR(pnew));
12410 goto onError;
12411 }
12412
Benjamin Peterson14339b62009-01-31 16:36:08 +000012413 Py_DECREF(tmp);
12414 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415
12416 onError:
12417 _Py_ForgetReference((PyObject *)pnew);
12418 PyObject_Del(pnew);
12419 Py_DECREF(tmp);
12420 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012421}
12422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012423PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012424 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012425\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012426Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012427encoding defaults to the current default string encoding.\n\
12428errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012429
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012430static PyObject *unicode_iter(PyObject *seq);
12431
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012433 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012434 "str", /* tp_name */
12435 sizeof(PyUnicodeObject), /* tp_size */
12436 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012438 (destructor)unicode_dealloc, /* tp_dealloc */
12439 0, /* tp_print */
12440 0, /* tp_getattr */
12441 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012442 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012443 unicode_repr, /* tp_repr */
12444 &unicode_as_number, /* tp_as_number */
12445 &unicode_as_sequence, /* tp_as_sequence */
12446 &unicode_as_mapping, /* tp_as_mapping */
12447 (hashfunc) unicode_hash, /* tp_hash*/
12448 0, /* tp_call*/
12449 (reprfunc) unicode_str, /* tp_str */
12450 PyObject_GenericGetAttr, /* tp_getattro */
12451 0, /* tp_setattro */
12452 0, /* tp_as_buffer */
12453 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012454 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012455 unicode_doc, /* tp_doc */
12456 0, /* tp_traverse */
12457 0, /* tp_clear */
12458 PyUnicode_RichCompare, /* tp_richcompare */
12459 0, /* tp_weaklistoffset */
12460 unicode_iter, /* tp_iter */
12461 0, /* tp_iternext */
12462 unicode_methods, /* tp_methods */
12463 0, /* tp_members */
12464 0, /* tp_getset */
12465 &PyBaseObject_Type, /* tp_base */
12466 0, /* tp_dict */
12467 0, /* tp_descr_get */
12468 0, /* tp_descr_set */
12469 0, /* tp_dictoffset */
12470 0, /* tp_init */
12471 0, /* tp_alloc */
12472 unicode_new, /* tp_new */
12473 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474};
12475
12476/* Initialize the Unicode implementation */
12477
Thomas Wouters78890102000-07-22 19:25:51 +000012478void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012480 int i;
12481
Thomas Wouters477c8d52006-05-27 19:21:47 +000012482 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012484 0x000A, /* LINE FEED */
12485 0x000D, /* CARRIAGE RETURN */
12486 0x001C, /* FILE SEPARATOR */
12487 0x001D, /* GROUP SEPARATOR */
12488 0x001E, /* RECORD SEPARATOR */
12489 0x0085, /* NEXT LINE */
12490 0x2028, /* LINE SEPARATOR */
12491 0x2029, /* PARAGRAPH SEPARATOR */
12492 };
12493
Fred Drakee4315f52000-05-09 19:53:39 +000012494 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012496 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012497 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012498
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012499 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012500 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012501 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012502 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012503
12504 /* initialize the linebreak bloom filter */
12505 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506 PyUnicode_2BYTE_KIND, linebreak,
12507 sizeof(linebreak) / sizeof(linebreak[0]));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012508
12509 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510}
12511
12512/* Finalize the Unicode implementation */
12513
/* This implementation releases nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
12519
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520void
Thomas Wouters78890102000-07-22 19:25:51 +000012521_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012523 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012525 Py_XDECREF(unicode_empty);
12526 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012527
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012528 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 if (unicode_latin1[i]) {
12530 Py_DECREF(unicode_latin1[i]);
12531 unicode_latin1[i] = NULL;
12532 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012533 }
Christian Heimesa156e092008-02-16 07:38:31 +000012534 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012536
Walter Dörwald16807132007-05-25 13:52:07 +000012537void
12538PyUnicode_InternInPlace(PyObject **p)
12539{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012540 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12541 PyObject *t;
12542 if (s == NULL || !PyUnicode_Check(s))
12543 Py_FatalError(
12544 "PyUnicode_InternInPlace: unicode strings only please!");
12545 /* If it's a subclass, we don't really know what putting
12546 it in the interned dict might do. */
12547 if (!PyUnicode_CheckExact(s))
12548 return;
12549 if (PyUnicode_CHECK_INTERNED(s))
12550 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 if (PyUnicode_READY(s) == -1) {
12552 assert(0 && "ready fail in intern...");
12553 return;
12554 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012555 if (interned == NULL) {
12556 interned = PyDict_New();
12557 if (interned == NULL) {
12558 PyErr_Clear(); /* Don't leave an exception */
12559 return;
12560 }
12561 }
12562 /* It might be that the GetItem call fails even
12563 though the key is present in the dictionary,
12564 namely when this happens during a stack overflow. */
12565 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012566 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012567 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012568
Benjamin Peterson29060642009-01-31 22:14:21 +000012569 if (t) {
12570 Py_INCREF(t);
12571 Py_DECREF(*p);
12572 *p = t;
12573 return;
12574 }
Walter Dörwald16807132007-05-25 13:52:07 +000012575
Benjamin Peterson14339b62009-01-31 16:36:08 +000012576 PyThreadState_GET()->recursion_critical = 1;
12577 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12578 PyErr_Clear();
12579 PyThreadState_GET()->recursion_critical = 0;
12580 return;
12581 }
12582 PyThreadState_GET()->recursion_critical = 0;
12583 /* The two references in interned are not counted by refcnt.
12584 The deallocator will take care of this */
12585 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012587}
12588
12589void
12590PyUnicode_InternImmortal(PyObject **p)
12591{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012592 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12593
Benjamin Peterson14339b62009-01-31 16:36:08 +000012594 PyUnicode_InternInPlace(p);
12595 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012597 Py_INCREF(*p);
12598 }
Walter Dörwald16807132007-05-25 13:52:07 +000012599}
12600
12601PyObject *
12602PyUnicode_InternFromString(const char *cp)
12603{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012604 PyObject *s = PyUnicode_FromString(cp);
12605 if (s == NULL)
12606 return NULL;
12607 PyUnicode_InternInPlace(&s);
12608 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012609}
12610
Alexander Belopolsky40018472011-02-26 01:02:56 +000012611void
12612_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012613{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012614 PyObject *keys;
12615 PyUnicodeObject *s;
12616 Py_ssize_t i, n;
12617 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012618
Benjamin Peterson14339b62009-01-31 16:36:08 +000012619 if (interned == NULL || !PyDict_Check(interned))
12620 return;
12621 keys = PyDict_Keys(interned);
12622 if (keys == NULL || !PyList_Check(keys)) {
12623 PyErr_Clear();
12624 return;
12625 }
Walter Dörwald16807132007-05-25 13:52:07 +000012626
Benjamin Peterson14339b62009-01-31 16:36:08 +000012627 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12628 detector, interned unicode strings are not forcibly deallocated;
12629 rather, we give them their stolen references back, and then clear
12630 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012631
Benjamin Peterson14339b62009-01-31 16:36:08 +000012632 n = PyList_GET_SIZE(keys);
12633 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012634 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012635 for (i = 0; i < n; i++) {
12636 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 if (PyUnicode_READY(s) == -1)
12638 fprintf(stderr, "could not ready string\n");
12639 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012640 case SSTATE_NOT_INTERNED:
12641 /* XXX Shouldn't happen */
12642 break;
12643 case SSTATE_INTERNED_IMMORTAL:
12644 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012646 break;
12647 case SSTATE_INTERNED_MORTAL:
12648 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012650 break;
12651 default:
12652 Py_FatalError("Inconsistent interned string state.");
12653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012655 }
12656 fprintf(stderr, "total size of all interned strings: "
12657 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12658 "mortal/immortal\n", mortal_size, immortal_size);
12659 Py_DECREF(keys);
12660 PyDict_Clear(interned);
12661 Py_DECREF(interned);
12662 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012663}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012664
12665
12666/********************* Unicode Iterator **************************/
12667
12668typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012669 PyObject_HEAD
12670 Py_ssize_t it_index;
12671 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012672} unicodeiterobject;
12673
12674static void
12675unicodeiter_dealloc(unicodeiterobject *it)
12676{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012677 _PyObject_GC_UNTRACK(it);
12678 Py_XDECREF(it->it_seq);
12679 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012680}
12681
12682static int
12683unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12684{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012685 Py_VISIT(it->it_seq);
12686 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012687}
12688
12689static PyObject *
12690unicodeiter_next(unicodeiterobject *it)
12691{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012692 PyUnicodeObject *seq;
12693 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012694
Benjamin Peterson14339b62009-01-31 16:36:08 +000012695 assert(it != NULL);
12696 seq = it->it_seq;
12697 if (seq == NULL)
12698 return NULL;
12699 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12702 int kind = PyUnicode_KIND(seq);
12703 void *data = PyUnicode_DATA(seq);
12704 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12705 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012706 if (item != NULL)
12707 ++it->it_index;
12708 return item;
12709 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012710
Benjamin Peterson14339b62009-01-31 16:36:08 +000012711 Py_DECREF(seq);
12712 it->it_seq = NULL;
12713 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012714}
12715
12716static PyObject *
12717unicodeiter_len(unicodeiterobject *it)
12718{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012719 Py_ssize_t len = 0;
12720 if (it->it_seq)
12721 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12722 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012723}
12724
12725PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12726
12727static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012728 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012729 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012730 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012731};
12732
12733PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012734 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12735 "str_iterator", /* tp_name */
12736 sizeof(unicodeiterobject), /* tp_basicsize */
12737 0, /* tp_itemsize */
12738 /* methods */
12739 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12740 0, /* tp_print */
12741 0, /* tp_getattr */
12742 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012743 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012744 0, /* tp_repr */
12745 0, /* tp_as_number */
12746 0, /* tp_as_sequence */
12747 0, /* tp_as_mapping */
12748 0, /* tp_hash */
12749 0, /* tp_call */
12750 0, /* tp_str */
12751 PyObject_GenericGetAttr, /* tp_getattro */
12752 0, /* tp_setattro */
12753 0, /* tp_as_buffer */
12754 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12755 0, /* tp_doc */
12756 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12757 0, /* tp_clear */
12758 0, /* tp_richcompare */
12759 0, /* tp_weaklistoffset */
12760 PyObject_SelfIter, /* tp_iter */
12761 (iternextfunc)unicodeiter_next, /* tp_iternext */
12762 unicodeiter_methods, /* tp_methods */
12763 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012764};
12765
12766static PyObject *
12767unicode_iter(PyObject *seq)
12768{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012769 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012770
Benjamin Peterson14339b62009-01-31 16:36:08 +000012771 if (!PyUnicode_Check(seq)) {
12772 PyErr_BadInternalCall();
12773 return NULL;
12774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 if (PyUnicode_READY(seq) == -1)
12776 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012777 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12778 if (it == NULL)
12779 return NULL;
12780 it->it_index = 0;
12781 Py_INCREF(seq);
12782 it->it_seq = (PyUnicodeObject *)seq;
12783 _PyObject_GC_TRACK(it);
12784 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012785}
12786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012787#define UNIOP(x) Py_UNICODE_##x
12788#define UNIOP_t Py_UNICODE
12789#include "uniops.h"
12790#undef UNIOP
12791#undef UNIOP_t
12792#define UNIOP(x) Py_UCS4_##x
12793#define UNIOP_t Py_UCS4
12794#include "uniops.h"
12795#undef UNIOP
12796#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012797
Victor Stinner71133ff2010-09-01 23:43:53 +000012798Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012799PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012800{
12801 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12802 Py_UNICODE *copy;
12803 Py_ssize_t size;
12804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012805 if (!PyUnicode_Check(unicode)) {
12806 PyErr_BadArgument();
12807 return NULL;
12808 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012809 /* Ensure we won't overflow the size. */
12810 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12811 PyErr_NoMemory();
12812 return NULL;
12813 }
12814 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12815 size *= sizeof(Py_UNICODE);
12816 copy = PyMem_Malloc(size);
12817 if (copy == NULL) {
12818 PyErr_NoMemory();
12819 return NULL;
12820 }
12821 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12822 return copy;
12823}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012824
Georg Brandl66c221e2010-10-14 07:04:07 +000012825/* A _string module, to export formatter_parser and formatter_field_name_split
12826 to the string.Formatter class implemented in Python. */
12827
12828static PyMethodDef _string_methods[] = {
12829 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12830 METH_O, PyDoc_STR("split the argument as a field name")},
12831 {"formatter_parser", (PyCFunction) formatter_parser,
12832 METH_O, PyDoc_STR("parse the argument as a format string")},
12833 {NULL, NULL}
12834};
12835
12836static struct PyModuleDef _string_module = {
12837 PyModuleDef_HEAD_INIT,
12838 "_string",
12839 PyDoc_STR("string helper module"),
12840 0,
12841 _string_methods,
12842 NULL,
12843 NULL,
12844 NULL,
12845 NULL
12846};
12847
12848PyMODINIT_FUNC
12849PyInit__string(void)
12850{
12851 return PyModule_Create(&_string_module);
12852}
12853
12854
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012855#ifdef __cplusplus
12856}
12857#endif