blob: 8f710bd29d24a29cdcf78c15a406afe5d4dae59e [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN ends up defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/


#ifdef __cplusplus
extern "C" {
#endif
91
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each of begin, end and to is evaluated exactly once, so the loop bound
   is not recomputed on every iteration.  The range [begin, end) may be
   empty, in which case nothing is written. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        const from_type *iter_ = (begin);                       \
        const from_type *end_ = (end);                          \
        to_type *to_ = (to_type *)(to);                         \
        for (; iter_ < end_; ++iter_, ++to_) {                  \
            *to_ = (to_type)*iter_;                             \
        }                                                       \
    } while (0)
106
/* File-local accessors for the fields of the string object structs.

   The plain field accessors (_PyUnicode_UTF8, _PyUnicode_UTF8_LENGTH,
   _PyUnicode_WSTR, _PyUnicode_WSTR_LENGTH, _PyUnicode_LENGTH,
   _PyUnicode_STATE, _PyUnicode_HASH, _PyUnicode_DATA_ANY) do no checking
   and expand to a struct member, so they can also be assigned to.

   PyUnicode_UTF8 and PyUnicode_UTF8_LENGTH assert that op is a ready str
   object; for a compact ASCII string they yield the memory directly after
   the PyASCIIObject header and its length field respectively, otherwise
   the utf8 / utf8_length members. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
138
Walter Dörwald16807132007-05-25 13:52:07 +0000139/* This dictionary holds all interned unicode strings. Note that references
140 to strings in this dictionary are *not* counted in the string's ob_refcnt.
141 When the interned string reaches a refcnt of 0 the string deallocation
142 function will delete the reference from this dictionary.
143
144 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000145 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000146*/
147static PyObject *interned;
148
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000149/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200150static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000151
152/* Single character Unicode strings in the Latin-1 range are being
153 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200154static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000155
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry is 1 when that ASCII character
   is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
Alexander Belopolsky40018472011-02-26 01:02:56 +0000187static PyObject *
188unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000189 PyObject **errorHandler,const char *encoding, const char *reason,
190 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
191 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
192
Alexander Belopolsky40018472011-02-26 01:02:56 +0000193static void
194raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300195 const char *encoding,
196 const Py_UNICODE *unicode, Py_ssize_t size,
197 Py_ssize_t startpos, Py_ssize_t endpos,
198 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000199
/* Same for linebreaks: indexed by code point 0..127, an entry is 1 when
   that ASCII character is a line break and 0 otherwise.  The table is only
   ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
227
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300228/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
229 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000230Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000231PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000233#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000234 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000235#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000236 /* This is actually an illegal character, so it should
237 not be passed to unichr. */
238 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000239#endif
240}
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242/* --- Bloom Filters ----------------------------------------------------- */
243
244/* stuff to implement simple "bloom filters" for Unicode characters.
245 to keep things simple, we use a single bitmask, using the least 5
246 bits from each unicode characters as the bit index. */
247
248/* the linebreak mask is set up by Unicode_Init below */
249
Antoine Pitrouf068f942010-01-13 14:19:12 +0000250#if LONG_BIT >= 128
251#define BLOOM_WIDTH 128
252#elif LONG_BIT >= 64
253#define BLOOM_WIDTH 64
254#elif LONG_BIT >= 32
255#define BLOOM_WIDTH 32
256#else
257#error "LONG_BIT is smaller than 32"
258#endif
259
Thomas Wouters477c8d52006-05-27 19:21:47 +0000260#define BLOOM_MASK unsigned long
261
262static BLOOM_MASK bloom_linebreak;
263
Antoine Pitrouf068f942010-01-13 14:19:12 +0000264#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
265#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000266
Benjamin Peterson29060642009-01-31 22:14:21 +0000267#define BLOOM_LINEBREAK(ch) \
268 ((ch) < 128U ? ascii_linebreak[(ch)] : \
269 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270
Alexander Belopolsky40018472011-02-26 01:02:56 +0000271Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200272make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273{
274 /* calculate simple bloom-style bitmask for a given unicode string */
275
Antoine Pitrouf068f942010-01-13 14:19:12 +0000276 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000277 Py_ssize_t i;
278
279 mask = 0;
280 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
283 return mask;
284}
285
/* True when chr occurs in str: the bloom mask is checked first and
   PyUnicode_FindChar is only called on a mask hit. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000289
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290/* --- Unicode Object ----------------------------------------------------- */
291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200292static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200293fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
294
295Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
296 Py_ssize_t size, Py_UCS4 ch,
297 int direction)
298{
299 /* like wcschr, but doesn't stop at NULL characters */
300 Py_ssize_t i;
301 if (direction == 1) {
302 for(i = 0; i < size; i++)
303 if (PyUnicode_READ(kind, s, i) == ch)
304 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
305 }
306 else {
307 for(i = size-1; i >= 0; i--)
308 if (PyUnicode_READ(kind, s, i) == ch)
309 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
310 }
311 return NULL;
312}
313
Alexander Belopolsky40018472011-02-26 01:02:56 +0000314static int
315unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317{
318 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200320 /* Resizing is only supported for old unicode objects. */
321 assert(!PyUnicode_IS_COMPACT(unicode));
322 assert(_PyUnicode_WSTR(unicode) != NULL);
323
324 /* ... and only if they have not been readied yet, because
325 callees usually rely on the wstr representation when resizing. */
326 assert(unicode->data.any == NULL);
327
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000328 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200329 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000330 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000332 /* Resizing shared object (unicode_empty or single character
333 objects) in-place is not allowed. Use PyUnicode_Resize()
334 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000335
Benjamin Peterson14339b62009-01-31 16:36:08 +0000336 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200337 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
338 _PyUnicode_WSTR(unicode)[0] < 256U &&
339 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000341 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 return -1;
343 }
344
Thomas Wouters477c8d52006-05-27 19:21:47 +0000345 /* We allocate one more byte to make sure the string is Ux0000 terminated.
346 The overallocation is also used by fastsearch, which assumes that it's
347 safe to look at str[length] (without making any assumptions about what
348 it contains). */
349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350 oldstr = _PyUnicode_WSTR(unicode);
351 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
352 sizeof(Py_UNICODE) * (length + 1));
353 if (!_PyUnicode_WSTR(unicode)) {
354 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 PyErr_NoMemory();
356 return -1;
357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 _PyUnicode_WSTR(unicode)[length] = 0;
359 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360
Benjamin Peterson29060642009-01-31 22:14:21 +0000361 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200362 if (unicode->data.any != NULL) {
363 PyObject_FREE(unicode->data.any);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200364 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != unicode->data.any) {
365 PyObject_FREE(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200367 _PyUnicode_UTF8(unicode) = NULL;
368 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200369 unicode->data.any = NULL;
370 _PyUnicode_LENGTH(unicode) = 0;
371 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
372 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200374 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000375
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return 0;
377}
378
379/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000380 Ux0000 terminated; some code (e.g. new_identifier)
381 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382
383 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000384 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385
386*/
387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200388#ifdef Py_DEBUG
389int unicode_old_new_calls = 0;
390#endif
391
Alexander Belopolsky40018472011-02-26 01:02:56 +0000392static PyUnicodeObject *
393_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394{
395 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200396 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 if (length == 0 && unicode_empty != NULL) {
400 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200401 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 }
403
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000404 /* Ensure we won't overflow the size. */
405 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
406 return (PyUnicodeObject *)PyErr_NoMemory();
407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200408 if (length < 0) {
409 PyErr_SetString(PyExc_SystemError,
410 "Negative size passed to _PyUnicode_New");
411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000412 }
413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414#ifdef Py_DEBUG
415 ++unicode_old_new_calls;
416#endif
417
418 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
419 if (unicode == NULL)
420 return NULL;
421 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
422 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
423 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 PyErr_NoMemory();
425 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200427
Jeremy Hyltond8082792003-09-16 19:41:39 +0000428 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000429 * the caller fails before initializing str -- unicode_resize()
430 * reads str[0], and the Keep-Alive optimization can keep memory
431 * allocated for str alive across a call to unicode_dealloc(unicode).
432 * We don't want unicode_resize to read uninitialized memory in
433 * that case.
434 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200435 _PyUnicode_WSTR(unicode)[0] = 0;
436 _PyUnicode_WSTR(unicode)[length] = 0;
437 _PyUnicode_WSTR_LENGTH(unicode) = length;
438 _PyUnicode_HASH(unicode) = -1;
439 _PyUnicode_STATE(unicode).interned = 0;
440 _PyUnicode_STATE(unicode).kind = 0;
441 _PyUnicode_STATE(unicode).compact = 0;
442 _PyUnicode_STATE(unicode).ready = 0;
443 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200444 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200445 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200446 _PyUnicode_UTF8(unicode) = NULL;
447 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000449
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000451 /* XXX UNREF/NEWREF interface should be more symmetrical */
452 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000453 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000454 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000455 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456}
457
Victor Stinnerf42dc442011-10-02 23:33:16 +0200458static const char*
459unicode_kind_name(PyObject *unicode)
460{
461 assert(PyUnicode_Check(unicode));
462 if (!PyUnicode_IS_COMPACT(unicode))
463 {
464 if (!PyUnicode_IS_READY(unicode))
465 return "wstr";
466 switch(PyUnicode_KIND(unicode))
467 {
468 case PyUnicode_1BYTE_KIND:
469 if (PyUnicode_IS_COMPACT_ASCII(unicode))
470 return "legacy ascii";
471 else
472 return "legacy latin1";
473 case PyUnicode_2BYTE_KIND:
474 return "legacy UCS2";
475 case PyUnicode_4BYTE_KIND:
476 return "legacy UCS4";
477 default:
478 return "<legacy invalid kind>";
479 }
480 }
481 assert(PyUnicode_IS_READY(unicode));
482 switch(PyUnicode_KIND(unicode))
483 {
484 case PyUnicode_1BYTE_KIND:
485 if (PyUnicode_IS_COMPACT_ASCII(unicode))
486 return "ascii";
487 else
488 return "compact latin1";
489 case PyUnicode_2BYTE_KIND:
490 return "compact UCS2";
491 case PyUnicode_4BYTE_KIND:
492 return "compact UCS4";
493 default:
494 return "<invalid compact kind>";
495 }
496}
497
#ifdef Py_DEBUG
/* Debug-build counter: number of objects created by PyUnicode_New(). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Wraps PyUnicode_UTF8(). */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Wraps _PyUnicode_COMPACT_DATA(). */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Prints the object address, its compact / compact-ascii flags and the
   candidate data addresses to stdout, then returns PyUnicode_DATA(). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
519
520PyObject *
521PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
522{
523 PyObject *obj;
524 PyCompactUnicodeObject *unicode;
525 void *data;
526 int kind_state;
527 int is_sharing = 0, is_ascii = 0;
528 Py_ssize_t char_size;
529 Py_ssize_t struct_size;
530
531 /* Optimization for empty strings */
532 if (size == 0 && unicode_empty != NULL) {
533 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200534 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200535 }
536
537#ifdef Py_DEBUG
538 ++unicode_new_new_calls;
539#endif
540
541 struct_size = sizeof(PyCompactUnicodeObject);
542 if (maxchar < 128) {
543 kind_state = PyUnicode_1BYTE_KIND;
544 char_size = 1;
545 is_ascii = 1;
546 struct_size = sizeof(PyASCIIObject);
547 }
548 else if (maxchar < 256) {
549 kind_state = PyUnicode_1BYTE_KIND;
550 char_size = 1;
551 }
552 else if (maxchar < 65536) {
553 kind_state = PyUnicode_2BYTE_KIND;
554 char_size = 2;
555 if (sizeof(wchar_t) == 2)
556 is_sharing = 1;
557 }
558 else {
559 kind_state = PyUnicode_4BYTE_KIND;
560 char_size = 4;
561 if (sizeof(wchar_t) == 4)
562 is_sharing = 1;
563 }
564
565 /* Ensure we won't overflow the size. */
566 if (size < 0) {
567 PyErr_SetString(PyExc_SystemError,
568 "Negative size passed to PyUnicode_New");
569 return NULL;
570 }
571 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
572 return PyErr_NoMemory();
573
574 /* Duplicated allocation code from _PyObject_New() instead of a call to
575 * PyObject_New() so we are able to allocate space for the object and
576 * it's data buffer.
577 */
578 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
579 if (obj == NULL)
580 return PyErr_NoMemory();
581 obj = PyObject_INIT(obj, &PyUnicode_Type);
582 if (obj == NULL)
583 return NULL;
584
585 unicode = (PyCompactUnicodeObject *)obj;
586 if (is_ascii)
587 data = ((PyASCIIObject*)obj) + 1;
588 else
589 data = unicode + 1;
590 _PyUnicode_LENGTH(unicode) = size;
591 _PyUnicode_HASH(unicode) = -1;
592 _PyUnicode_STATE(unicode).interned = 0;
593 _PyUnicode_STATE(unicode).kind = kind_state;
594 _PyUnicode_STATE(unicode).compact = 1;
595 _PyUnicode_STATE(unicode).ready = 1;
596 _PyUnicode_STATE(unicode).ascii = is_ascii;
597 if (is_ascii) {
598 ((char*)data)[size] = 0;
599 _PyUnicode_WSTR(unicode) = NULL;
600 }
601 else if (kind_state == PyUnicode_1BYTE_KIND) {
602 ((char*)data)[size] = 0;
603 _PyUnicode_WSTR(unicode) = NULL;
604 _PyUnicode_WSTR_LENGTH(unicode) = 0;
605 unicode->utf8_length = 0;
606 unicode->utf8 = NULL;
607 }
608 else {
609 unicode->utf8 = NULL;
610 if (kind_state == PyUnicode_2BYTE_KIND)
611 ((Py_UCS2*)data)[size] = 0;
612 else /* kind_state == PyUnicode_4BYTE_KIND */
613 ((Py_UCS4*)data)[size] = 0;
614 if (is_sharing) {
615 _PyUnicode_WSTR_LENGTH(unicode) = size;
616 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
617 }
618 else {
619 _PyUnicode_WSTR_LENGTH(unicode) = 0;
620 _PyUnicode_WSTR(unicode) = NULL;
621 }
622 }
623 return obj;
624}
625
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   A high surrogate (0xD800-0xDBFF) directly followed by a low surrogate
   (0xDC00-0xDFFF) is combined into one code point; every other unit,
   including an unpaired surrogate, is copied unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *iter;
    Py_UCS4 *ucs4_out;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    ucs4_out = PyUnicode_4BYTE_DATA(unicode);

    for (iter = begin; iter < end; ) {
        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
                           _PyUnicode_GET_LENGTH(unicode)));
        if (*iter >= 0xD800 && *iter <= 0xDBFF
            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
        {
            *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
            iter += 2;
        }
        else {
            *ucs4_out++ = *iter;
            iter++;
        }
    }
    /* The output must have been filled exactly up to the string length. */
    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
                        _PyUnicode_GET_LENGTH(unicode)));
}
#endif
664
Victor Stinnercd9950f2011-10-02 00:34:53 +0200665static int
666_PyUnicode_Dirty(PyObject *unicode)
667{
668 assert(PyUnicode_Check(unicode));
669 if (Py_REFCNT(unicode) != 1) {
670 PyErr_SetString(PyExc_ValueError,
671 "Cannot modify a string having more than 1 reference");
672 return -1;
673 }
674 _PyUnicode_DIRTY(unicode);
675 return 0;
676}
677
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200678Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200679PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
680 PyObject *from, Py_ssize_t from_start,
681 Py_ssize_t how_many)
682{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200683 unsigned int from_kind, to_kind;
684 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200685
Victor Stinnerb1536152011-09-30 02:26:10 +0200686 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
687 PyErr_BadInternalCall();
688 return -1;
689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200690
691 if (PyUnicode_READY(from))
692 return -1;
693 if (PyUnicode_READY(to))
694 return -1;
695
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200696 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200697 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
698 PyErr_Format(PyExc_ValueError,
699 "Cannot write %zi characters at %zi "
700 "in a string of %zi characters",
701 how_many, to_start, PyUnicode_GET_LENGTH(to));
702 return -1;
703 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200704 if (how_many == 0)
705 return 0;
706
Victor Stinnercd9950f2011-10-02 00:34:53 +0200707 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200708 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200710 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200711 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200712 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200713 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200714
Victor Stinnerf42dc442011-10-02 23:33:16 +0200715 if (from_kind == to_kind
716 /* deny latin1 => ascii */
717 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
718 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200719 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200720 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200721 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200722 + PyUnicode_KIND_SIZE(from_kind, from_start),
723 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200725 else if (from_kind == PyUnicode_1BYTE_KIND
726 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200727 {
728 _PyUnicode_CONVERT_BYTES(
729 Py_UCS1, Py_UCS2,
730 PyUnicode_1BYTE_DATA(from) + from_start,
731 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
732 PyUnicode_2BYTE_DATA(to) + to_start
733 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200734 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200735 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200736 && to_kind == PyUnicode_4BYTE_KIND)
737 {
738 _PyUnicode_CONVERT_BYTES(
739 Py_UCS1, Py_UCS4,
740 PyUnicode_1BYTE_DATA(from) + from_start,
741 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
742 PyUnicode_4BYTE_DATA(to) + to_start
743 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200744 }
745 else if (from_kind == PyUnicode_2BYTE_KIND
746 && to_kind == PyUnicode_4BYTE_KIND)
747 {
748 _PyUnicode_CONVERT_BYTES(
749 Py_UCS2, Py_UCS4,
750 PyUnicode_2BYTE_DATA(from) + from_start,
751 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
752 PyUnicode_4BYTE_DATA(to) + to_start
753 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200754 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200755 else {
756 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200757
758 /* check if max_char(from substring) <= max_char(to) */
759 if (from_kind > to_kind
760 /* latin1 => ascii */
761 || (PyUnicode_IS_COMPACT_ASCII(to)
762 && to_kind == PyUnicode_1BYTE_KIND
763 && !PyUnicode_IS_COMPACT_ASCII(from)))
764 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200765 /* slow path to check for character overflow */
766 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
767 Py_UCS4 ch, maxchar;
768 Py_ssize_t i;
769
770 maxchar = 0;
771 invalid_kinds = 0;
772 for (i=0; i < how_many; i++) {
773 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
774 if (ch > maxchar) {
775 maxchar = ch;
776 if (maxchar > to_maxchar) {
777 invalid_kinds = 1;
778 break;
779 }
780 }
781 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
782 }
783 }
784 else
785 invalid_kinds = 1;
786 if (invalid_kinds) {
787 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200788 "Cannot copy %s characters "
789 "into a string of %s characters",
790 unicode_kind_name(from),
791 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200792 return -1;
793 }
794 }
795 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200796}
797
Victor Stinner17222162011-09-28 22:15:37 +0200798/* Find the maximum code point and count the number of surrogate pairs so a
799 correct string length can be computed before converting a string to UCS4.
800 This function counts single surrogates as a character and not as a pair.
801
802 Return 0 on success, or -1 on error. */
803static int
804find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
805 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200806{
807 const wchar_t *iter;
808
Victor Stinnerc53be962011-10-02 21:33:54 +0200809 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810 if (num_surrogates == NULL || maxchar == NULL) {
811 PyErr_SetString(PyExc_SystemError,
812 "unexpected NULL arguments to "
813 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
814 return -1;
815 }
816
817 *num_surrogates = 0;
818 *maxchar = 0;
819
820 for (iter = begin; iter < end; ) {
821 if (*iter > *maxchar)
822 *maxchar = *iter;
823#if SIZEOF_WCHAR_T == 2
824 if (*iter >= 0xD800 && *iter <= 0xDBFF
825 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
826 {
827 Py_UCS4 surrogate_val;
828 surrogate_val = (((iter[0] & 0x3FF)<<10)
829 | (iter[1] & 0x3FF)) + 0x10000;
830 ++(*num_surrogates);
831 if (surrogate_val > *maxchar)
832 *maxchar = surrogate_val;
833 iter += 2;
834 }
835 else
836 iter++;
837#else
838 iter++;
839#endif
840 }
841 return 0;
842}
843
844#ifdef Py_DEBUG
845int unicode_ready_calls = 0;
846#endif
847
848int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200849_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200851 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852 wchar_t *end;
853 Py_UCS4 maxchar = 0;
854 Py_ssize_t num_surrogates;
855#if SIZEOF_WCHAR_T == 2
856 Py_ssize_t length_wo_surrogates;
857#endif
858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200860 strings were created using _PyObject_New() and where no canonical
861 representation (the str field) has been set yet aka strings
862 which are not yet ready. */
863 assert(PyUnicode_Check(obj));
864 assert(!PyUnicode_IS_READY(obj));
865 assert(!PyUnicode_IS_COMPACT(obj));
866 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200867 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +0200868 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200869 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200870 /* Actually, it should neither be interned nor be anything else: */
871 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200872
873#ifdef Py_DEBUG
874 ++unicode_ready_calls;
875#endif
876
877 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200878 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200879 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881
882 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +0200883 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
884 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200885 PyErr_NoMemory();
886 return -1;
887 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200888 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889 _PyUnicode_WSTR(unicode), end,
890 PyUnicode_1BYTE_DATA(unicode));
891 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
892 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
893 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
894 if (maxchar < 128) {
Victor Stinnerc3c74152011-10-02 20:39:55 +0200895 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200896 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897 }
898 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200899 _PyUnicode_UTF8(unicode) = NULL;
900 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200901 }
902 PyObject_FREE(_PyUnicode_WSTR(unicode));
903 _PyUnicode_WSTR(unicode) = NULL;
904 _PyUnicode_WSTR_LENGTH(unicode) = 0;
905 }
906 /* In this case we might have to convert down from 4-byte native
907 wchar_t to 2-byte unicode. */
908 else if (maxchar < 65536) {
909 assert(num_surrogates == 0 &&
910 "FindMaxCharAndNumSurrogatePairs() messed up");
911
Victor Stinner506f5922011-09-28 22:34:18 +0200912#if SIZEOF_WCHAR_T == 2
913 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +0200914 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +0200915 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
916 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
917 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200918 _PyUnicode_UTF8(unicode) = NULL;
919 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200920#else
921 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +0200922 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +0200923 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +0200924 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +0200925 PyErr_NoMemory();
926 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927 }
Victor Stinner506f5922011-09-28 22:34:18 +0200928 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
929 _PyUnicode_WSTR(unicode), end,
930 PyUnicode_2BYTE_DATA(unicode));
931 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
932 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
933 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200934 _PyUnicode_UTF8(unicode) = NULL;
935 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200936 PyObject_FREE(_PyUnicode_WSTR(unicode));
937 _PyUnicode_WSTR(unicode) = NULL;
938 _PyUnicode_WSTR_LENGTH(unicode) = 0;
939#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940 }
941 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
942 else {
943#if SIZEOF_WCHAR_T == 2
944 /* in case the native representation is 2-bytes, we need to allocate a
945 new normalized 4-byte version. */
946 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200947 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
948 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200949 PyErr_NoMemory();
950 return -1;
951 }
952 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
953 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200954 _PyUnicode_UTF8(unicode) = NULL;
955 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinnerc53be962011-10-02 21:33:54 +0200956 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957 PyObject_FREE(_PyUnicode_WSTR(unicode));
958 _PyUnicode_WSTR(unicode) = NULL;
959 _PyUnicode_WSTR_LENGTH(unicode) = 0;
960#else
961 assert(num_surrogates == 0);
962
Victor Stinnerc3c74152011-10-02 20:39:55 +0200963 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200964 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200965 _PyUnicode_UTF8(unicode) = NULL;
966 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
968#endif
969 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
970 }
971 _PyUnicode_STATE(unicode).ready = 1;
972 return 0;
973}
974
Alexander Belopolsky40018472011-02-26 01:02:56 +0000975static void
976unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000977{
Walter Dörwald16807132007-05-25 13:52:07 +0000978 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000979 case SSTATE_NOT_INTERNED:
980 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000981
Benjamin Peterson29060642009-01-31 22:14:21 +0000982 case SSTATE_INTERNED_MORTAL:
983 /* revive dead object temporarily for DelItem */
984 Py_REFCNT(unicode) = 3;
985 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
986 Py_FatalError(
987 "deletion of interned string failed");
988 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000989
Benjamin Peterson29060642009-01-31 22:14:21 +0000990 case SSTATE_INTERNED_IMMORTAL:
991 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000992
Benjamin Peterson29060642009-01-31 22:14:21 +0000993 default:
994 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000995 }
996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200997 if (_PyUnicode_WSTR(unicode) &&
998 (!PyUnicode_IS_READY(unicode) ||
999 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1000 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001001 if (!PyUnicode_IS_COMPACT_ASCII(unicode)
1002 && _PyUnicode_UTF8(unicode)
1003 && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
1004 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001005
1006 if (PyUnicode_IS_COMPACT(unicode)) {
1007 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008 }
1009 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001010 if (_PyUnicode_DATA_ANY(unicode))
1011 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001012 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001013 }
1014}
1015
Alexander Belopolsky40018472011-02-26 01:02:56 +00001016static int
1017_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001018{
1019 register PyUnicodeObject *v;
1020
1021 /* Argument checks */
1022 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001023 PyErr_BadInternalCall();
1024 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001025 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001026 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
1028 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001029 PyErr_BadInternalCall();
1030 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001031 }
1032
1033 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001034 possible since these are being shared.
1035 The same goes for new-representation unicode objects or objects which
1036 have already been readied.
1037 For these, we simply return a fresh copy with the same Unicode content.
1038 */
1039 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
1040 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
1041 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001042 PyUnicodeObject *w = _PyUnicode_New(length);
1043 if (w == NULL)
1044 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
1046 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +00001047 Py_DECREF(*unicode);
1048 *unicode = w;
1049 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001050 }
1051
1052 /* Note that we don't have to modify *unicode for unshared Unicode
1053 objects, since we can modify them in-place. */
1054 return unicode_resize(v, length);
1055}
1056
Alexander Belopolsky40018472011-02-26 01:02:56 +00001057int
1058PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001059{
1060 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
1061}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063static PyObject*
1064get_latin1_char(unsigned char ch)
1065{
Victor Stinnera464fc12011-10-02 20:39:30 +02001066 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001068 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 if (!unicode)
1070 return NULL;
1071 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1072 unicode_latin1[ch] = unicode;
1073 }
1074 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001075 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001076}
1077
Alexander Belopolsky40018472011-02-26 01:02:56 +00001078PyObject *
1079PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080{
1081 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 Py_UCS4 maxchar = 0;
1083 Py_ssize_t num_surrogates;
1084
1085 if (u == NULL)
1086 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001088 /* If the Unicode data is known at construction time, we can apply
1089 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001091 /* Optimization for empty strings */
1092 if (size == 0 && unicode_empty != NULL) {
1093 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001094 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001095 }
Tim Petersced69f82003-09-16 20:30:58 +00001096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 /* Single character Unicode objects in the Latin-1 range are
1098 shared when using this constructor */
1099 if (size == 1 && *u < 256)
1100 return get_latin1_char((unsigned char)*u);
1101
1102 /* If not empty and not single character, copy the Unicode data
1103 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001104 if (find_maxchar_surrogates(u, u + size,
1105 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 return NULL;
1107
1108 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1109 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 if (!unicode)
1111 return NULL;
1112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113 switch (PyUnicode_KIND(unicode)) {
1114 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001115 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1117 break;
1118 case PyUnicode_2BYTE_KIND:
1119#if Py_UNICODE_SIZE == 2
1120 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1121#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001122 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1124#endif
1125 break;
1126 case PyUnicode_4BYTE_KIND:
1127#if SIZEOF_WCHAR_T == 2
1128 /* This is the only case which has to process surrogates, thus
1129 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001130 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131#else
1132 assert(num_surrogates == 0);
1133 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1134#endif
1135 break;
1136 default:
1137 assert(0 && "Impossible state");
1138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001139
1140 return (PyObject *)unicode;
1141}
1142
Alexander Belopolsky40018472011-02-26 01:02:56 +00001143PyObject *
1144PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001145{
1146 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001147
Benjamin Peterson14339b62009-01-31 16:36:08 +00001148 if (size < 0) {
1149 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001150 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001151 return NULL;
1152 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001153
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001154 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001155 some optimizations which share commonly used objects.
1156 Also, this means the input must be UTF-8, so fall back to the
1157 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001158 if (u != NULL) {
1159
Benjamin Peterson29060642009-01-31 22:14:21 +00001160 /* Optimization for empty strings */
1161 if (size == 0 && unicode_empty != NULL) {
1162 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001163 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001164 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001165
1166 /* Single characters are shared when using this constructor.
1167 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 if (size == 1 && Py_CHARMASK(*u) < 128)
1169 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001170
1171 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001172 }
1173
Walter Dörwald55507312007-05-18 13:12:10 +00001174 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001175 if (!unicode)
1176 return NULL;
1177
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001178 return (PyObject *)unicode;
1179}
1180
Alexander Belopolsky40018472011-02-26 01:02:56 +00001181PyObject *
1182PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001183{
1184 size_t size = strlen(u);
1185 if (size > PY_SSIZE_T_MAX) {
1186 PyErr_SetString(PyExc_OverflowError, "input too long");
1187 return NULL;
1188 }
1189
1190 return PyUnicode_FromStringAndSize(u, size);
1191}
1192
Victor Stinnere57b1c02011-09-28 22:20:48 +02001193static PyObject*
1194_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001195{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001196 PyObject *res;
1197 unsigned char max = 127;
1198 Py_ssize_t i;
1199 for (i = 0; i < size; i++) {
1200 if (u[i] & 0x80) {
1201 max = 255;
1202 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001203 }
1204 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001205 res = PyUnicode_New(size, max);
1206 if (!res)
1207 return NULL;
1208 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1209 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001210}
1211
Victor Stinnere57b1c02011-09-28 22:20:48 +02001212static PyObject*
1213_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001214{
1215 PyObject *res;
1216 Py_UCS2 max = 0;
1217 Py_ssize_t i;
1218 for (i = 0; i < size; i++)
1219 if (u[i] > max)
1220 max = u[i];
1221 res = PyUnicode_New(size, max);
1222 if (!res)
1223 return NULL;
1224 if (max >= 256)
1225 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1226 else
1227 for (i = 0; i < size; i++)
1228 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1229 return res;
1230}
1231
Victor Stinnere57b1c02011-09-28 22:20:48 +02001232static PyObject*
1233_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234{
1235 PyObject *res;
1236 Py_UCS4 max = 0;
1237 Py_ssize_t i;
1238 for (i = 0; i < size; i++)
1239 if (u[i] > max)
1240 max = u[i];
1241 res = PyUnicode_New(size, max);
1242 if (!res)
1243 return NULL;
1244 if (max >= 0x10000)
1245 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1246 else {
1247 int kind = PyUnicode_KIND(res);
1248 void *data = PyUnicode_DATA(res);
1249 for (i = 0; i < size; i++)
1250 PyUnicode_WRITE(kind, data, i, u[i]);
1251 }
1252 return res;
1253}
1254
1255PyObject*
1256PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1257{
1258 switch(kind) {
1259 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001260 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001261 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001262 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001264 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001265 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001266 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267 return NULL;
1268}
1269
Victor Stinner034f6cf2011-09-30 02:26:44 +02001270PyObject*
1271PyUnicode_Copy(PyObject *unicode)
1272{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001273 Py_ssize_t size;
1274 PyObject *copy;
1275 void *data;
1276
Victor Stinner034f6cf2011-09-30 02:26:44 +02001277 if (!PyUnicode_Check(unicode)) {
1278 PyErr_BadInternalCall();
1279 return NULL;
1280 }
1281 if (PyUnicode_READY(unicode))
1282 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001283
1284 size = PyUnicode_GET_LENGTH(unicode);
1285 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1286 if (!copy)
1287 return NULL;
1288 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1289
1290 data = PyUnicode_DATA(unicode);
1291 switch (PyUnicode_KIND(unicode))
1292 {
1293 case PyUnicode_1BYTE_KIND:
1294 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1295 break;
1296 case PyUnicode_2BYTE_KIND:
1297 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1298 break;
1299 case PyUnicode_4BYTE_KIND:
1300 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1301 break;
1302 default:
1303 assert(0);
1304 break;
1305 }
1306 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001307}
1308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309
Victor Stinnerbc603d12011-10-02 01:00:40 +02001310/* Widen Unicode objects to larger buffers. Don't write terminating null
1311 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312
1313void*
1314_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1315{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001316 Py_ssize_t len;
1317 void *result;
1318 unsigned int skind;
1319
1320 if (PyUnicode_READY(s))
1321 return NULL;
1322
1323 len = PyUnicode_GET_LENGTH(s);
1324 skind = PyUnicode_KIND(s);
1325 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1327 return NULL;
1328 }
1329 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001330 case PyUnicode_2BYTE_KIND:
1331 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1332 if (!result)
1333 return PyErr_NoMemory();
1334 assert(skind == PyUnicode_1BYTE_KIND);
1335 _PyUnicode_CONVERT_BYTES(
1336 Py_UCS1, Py_UCS2,
1337 PyUnicode_1BYTE_DATA(s),
1338 PyUnicode_1BYTE_DATA(s) + len,
1339 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001341 case PyUnicode_4BYTE_KIND:
1342 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1343 if (!result)
1344 return PyErr_NoMemory();
1345 if (skind == PyUnicode_2BYTE_KIND) {
1346 _PyUnicode_CONVERT_BYTES(
1347 Py_UCS2, Py_UCS4,
1348 PyUnicode_2BYTE_DATA(s),
1349 PyUnicode_2BYTE_DATA(s) + len,
1350 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001352 else {
1353 assert(skind == PyUnicode_1BYTE_KIND);
1354 _PyUnicode_CONVERT_BYTES(
1355 Py_UCS1, Py_UCS4,
1356 PyUnicode_1BYTE_DATA(s),
1357 PyUnicode_1BYTE_DATA(s) + len,
1358 result);
1359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001360 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001361 default:
1362 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001364 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365 return NULL;
1366}
1367
1368static Py_UCS4*
1369as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1370 int copy_null)
1371{
1372 int kind;
1373 void *data;
1374 Py_ssize_t len, targetlen;
1375 if (PyUnicode_READY(string) == -1)
1376 return NULL;
1377 kind = PyUnicode_KIND(string);
1378 data = PyUnicode_DATA(string);
1379 len = PyUnicode_GET_LENGTH(string);
1380 targetlen = len;
1381 if (copy_null)
1382 targetlen++;
1383 if (!target) {
1384 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1385 PyErr_NoMemory();
1386 return NULL;
1387 }
1388 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1389 if (!target) {
1390 PyErr_NoMemory();
1391 return NULL;
1392 }
1393 }
1394 else {
1395 if (targetsize < targetlen) {
1396 PyErr_Format(PyExc_SystemError,
1397 "string is longer than the buffer");
1398 if (copy_null && 0 < targetsize)
1399 target[0] = 0;
1400 return NULL;
1401 }
1402 }
1403 if (kind != PyUnicode_4BYTE_KIND) {
1404 Py_ssize_t i;
1405 for (i = 0; i < len; i++)
1406 target[i] = PyUnicode_READ(kind, data, i);
1407 }
1408 else
1409 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1410 if (copy_null)
1411 target[len] = 0;
1412 return target;
1413}
1414
1415Py_UCS4*
1416PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1417 int copy_null)
1418{
1419 if (target == NULL || targetsize < 1) {
1420 PyErr_BadInternalCall();
1421 return NULL;
1422 }
1423 return as_ucs4(string, target, targetsize, copy_null);
1424}
1425
1426Py_UCS4*
1427PyUnicode_AsUCS4Copy(PyObject *string)
1428{
1429 return as_ucs4(string, NULL, 0, 1);
1430}
1431
#ifdef HAVE_WCHAR_H

/* Create a unicode object from `size` wchar_t characters starting at w.
   A size of -1 means that the length is computed with wcslen().  A NULL w
   is only accepted together with a size of 0, in which case a new empty
   string is returned; otherwise it is a bad internal call.  Return NULL on
   error. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t length = size;

        if (length == -1)
            length = wcslen(w);
        return PyUnicode_FromUnicode(w, length);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001452
Walter Dörwald346737f2007-05-31 10:44:43 +00001453static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001454makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1455 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001456{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001457 *fmt++ = '%';
1458 if (width) {
1459 if (zeropad)
1460 *fmt++ = '0';
1461 fmt += sprintf(fmt, "%d", width);
1462 }
1463 if (precision)
1464 fmt += sprintf(fmt, ".%d", precision);
1465 if (longflag)
1466 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001467 else if (longlongflag) {
1468 /* longlongflag should only ever be nonzero on machines with
1469 HAVE_LONG_LONG defined */
1470#ifdef HAVE_LONG_LONG
1471 char *f = PY_FORMAT_LONG_LONG;
1472 while (*f)
1473 *fmt++ = *f++;
1474#else
1475 /* we shouldn't ever get here */
1476 assert(0);
1477 *fmt++ = 'l';
1478#endif
1479 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001480 else if (size_tflag) {
1481 char *f = PY_FORMAT_SIZE_T;
1482 while (*f)
1483 *fmt++ = *f++;
1484 }
1485 *fmt++ = c;
1486 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001487}
1488
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that sits between '%' and the conversion character.

   On entry `f` points at the '%'.  The function reads an optional decimal
   width, an optional ".precision", and an optional length modifier:
     "l"  followed by d/u/i  -> *p_longflag     = 1
     "ll" followed by d/u/i  -> *p_longlongflag = 1 (only with HAVE_LONG_LONG)
     "z"  followed by d/u/i  -> *p_size_tflag   = 1
   A modifier that is not followed by d/u/i is left unconsumed.

   Every output pointer may be NULL, in which case that value is simply
   not reported.  The return value points at the conversion character
   (or at the unconsumed character for a malformed specification). */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        field_width = (field_width*10) + *f - '0';

    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* size_t / Py_ssize_t flag */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
1553
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001554/* maximum number of characters required for output of %ld. 21 characters
1555 allows for 64-bit integers (in decimal) and an optional sign. */
1556#define MAX_LONG_CHARS 21
1557/* maximum number of characters required for output of %lld.
1558 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1559 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1560#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1561
Walter Dörwaldd2034312007-05-18 16:29:38 +00001562PyObject *
1563PyUnicode_FromFormatV(const char *format, va_list vargs)
1564{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001565 va_list count;
1566 Py_ssize_t callcount = 0;
1567 PyObject **callresults = NULL;
1568 PyObject **callresult = NULL;
1569 Py_ssize_t n = 0;
1570 int width = 0;
1571 int precision = 0;
1572 int zeropad;
1573 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001575 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001576 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1578 Py_UCS4 argmaxchar;
1579 Py_ssize_t numbersize = 0;
1580 char *numberresults = NULL;
1581 char *numberresult = NULL;
1582 Py_ssize_t i;
1583 int kind;
1584 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001585
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001586 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001587 /* step 1: count the number of %S/%R/%A/%s format specifications
1588 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1589 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590 * result in an array)
1591 * also esimate a upper bound for all the number formats in the string,
1592 * numbers will be formated in step 3 and be keept in a '\0'-separated
1593 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001594 for (f = format; *f; f++) {
1595 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001596 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1598 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1599 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1600 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001603#ifdef HAVE_LONG_LONG
1604 if (longlongflag) {
1605 if (width < MAX_LONG_LONG_CHARS)
1606 width = MAX_LONG_LONG_CHARS;
1607 }
1608 else
1609#endif
1610 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1611 including sign. Decimal takes the most space. This
1612 isn't enough for octal. If a width is specified we
1613 need more (which we allocate later). */
1614 if (width < MAX_LONG_CHARS)
1615 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616
1617 /* account for the size + '\0' to separate numbers
1618 inside of the numberresults buffer */
1619 numbersize += (width + 1);
1620 }
1621 }
1622 else if ((unsigned char)*f > 127) {
1623 PyErr_Format(PyExc_ValueError,
1624 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1625 "string, got a non-ASCII byte: 0x%02x",
1626 (unsigned char)*f);
1627 return NULL;
1628 }
1629 }
1630 /* step 2: allocate memory for the results of
1631 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1632 if (callcount) {
1633 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1634 if (!callresults) {
1635 PyErr_NoMemory();
1636 return NULL;
1637 }
1638 callresult = callresults;
1639 }
1640 /* step 2.5: allocate memory for the results of formating numbers */
1641 if (numbersize) {
1642 numberresults = PyObject_Malloc(numbersize);
1643 if (!numberresults) {
1644 PyErr_NoMemory();
1645 goto fail;
1646 }
1647 numberresult = numberresults;
1648 }
1649
1650 /* step 3: format numbers and figure out how large a buffer we need */
1651 for (f = format; *f; f++) {
1652 if (*f == '%') {
1653 const char* p;
1654 int longflag;
1655 int longlongflag;
1656 int size_tflag;
1657 int numprinted;
1658
1659 p = f;
1660 zeropad = (f[1] == '0');
1661 f = parse_format_flags(f, &width, &precision,
1662 &longflag, &longlongflag, &size_tflag);
1663 switch (*f) {
1664 case 'c':
1665 {
1666 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001667 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001668 n++;
1669 break;
1670 }
1671 case '%':
1672 n++;
1673 break;
1674 case 'i':
1675 case 'd':
1676 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1677 width, precision, *f);
1678 if (longflag)
1679 numprinted = sprintf(numberresult, fmt,
1680 va_arg(count, long));
1681#ifdef HAVE_LONG_LONG
1682 else if (longlongflag)
1683 numprinted = sprintf(numberresult, fmt,
1684 va_arg(count, PY_LONG_LONG));
1685#endif
1686 else if (size_tflag)
1687 numprinted = sprintf(numberresult, fmt,
1688 va_arg(count, Py_ssize_t));
1689 else
1690 numprinted = sprintf(numberresult, fmt,
1691 va_arg(count, int));
1692 n += numprinted;
1693 /* advance by +1 to skip over the '\0' */
1694 numberresult += (numprinted + 1);
1695 assert(*(numberresult - 1) == '\0');
1696 assert(*(numberresult - 2) != '\0');
1697 assert(numprinted >= 0);
1698 assert(numberresult <= numberresults + numbersize);
1699 break;
1700 case 'u':
1701 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1702 width, precision, 'u');
1703 if (longflag)
1704 numprinted = sprintf(numberresult, fmt,
1705 va_arg(count, unsigned long));
1706#ifdef HAVE_LONG_LONG
1707 else if (longlongflag)
1708 numprinted = sprintf(numberresult, fmt,
1709 va_arg(count, unsigned PY_LONG_LONG));
1710#endif
1711 else if (size_tflag)
1712 numprinted = sprintf(numberresult, fmt,
1713 va_arg(count, size_t));
1714 else
1715 numprinted = sprintf(numberresult, fmt,
1716 va_arg(count, unsigned int));
1717 n += numprinted;
1718 numberresult += (numprinted + 1);
1719 assert(*(numberresult - 1) == '\0');
1720 assert(*(numberresult - 2) != '\0');
1721 assert(numprinted >= 0);
1722 assert(numberresult <= numberresults + numbersize);
1723 break;
1724 case 'x':
1725 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1726 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1727 n += numprinted;
1728 numberresult += (numprinted + 1);
1729 assert(*(numberresult - 1) == '\0');
1730 assert(*(numberresult - 2) != '\0');
1731 assert(numprinted >= 0);
1732 assert(numberresult <= numberresults + numbersize);
1733 break;
1734 case 'p':
1735 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1736 /* %p is ill-defined: ensure leading 0x. */
1737 if (numberresult[1] == 'X')
1738 numberresult[1] = 'x';
1739 else if (numberresult[1] != 'x') {
1740 memmove(numberresult + 2, numberresult,
1741 strlen(numberresult) + 1);
1742 numberresult[0] = '0';
1743 numberresult[1] = 'x';
1744 numprinted += 2;
1745 }
1746 n += numprinted;
1747 numberresult += (numprinted + 1);
1748 assert(*(numberresult - 1) == '\0');
1749 assert(*(numberresult - 2) != '\0');
1750 assert(numprinted >= 0);
1751 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001752 break;
1753 case 's':
1754 {
1755 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001756 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001757 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1758 if (!str)
1759 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 /* since PyUnicode_DecodeUTF8 returns already flexible
1761 unicode objects, there is no need to call ready on them */
1762 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001763 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001765 /* Remember the str and switch to the next slot */
1766 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001767 break;
1768 }
1769 case 'U':
1770 {
1771 PyObject *obj = va_arg(count, PyObject *);
1772 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 if (PyUnicode_READY(obj) == -1)
1774 goto fail;
1775 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001776 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001778 break;
1779 }
1780 case 'V':
1781 {
1782 PyObject *obj = va_arg(count, PyObject *);
1783 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001784 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001785 assert(obj || str);
1786 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001787 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 if (PyUnicode_READY(obj) == -1)
1789 goto fail;
1790 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001791 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001793 *callresult++ = NULL;
1794 }
1795 else {
1796 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1797 if (!str_obj)
1798 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001800 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001802 *callresult++ = str_obj;
1803 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001804 break;
1805 }
1806 case 'S':
1807 {
1808 PyObject *obj = va_arg(count, PyObject *);
1809 PyObject *str;
1810 assert(obj);
1811 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001813 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001815 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001817 /* Remember the str and switch to the next slot */
1818 *callresult++ = str;
1819 break;
1820 }
1821 case 'R':
1822 {
1823 PyObject *obj = va_arg(count, PyObject *);
1824 PyObject *repr;
1825 assert(obj);
1826 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001828 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001830 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001832 /* Remember the repr and switch to the next slot */
1833 *callresult++ = repr;
1834 break;
1835 }
1836 case 'A':
1837 {
1838 PyObject *obj = va_arg(count, PyObject *);
1839 PyObject *ascii;
1840 assert(obj);
1841 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001843 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001845 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001847 /* Remember the repr and switch to the next slot */
1848 *callresult++ = ascii;
1849 break;
1850 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001851 default:
1852 /* if we stumble upon an unknown
1853 formatting code, copy the rest of
1854 the format string to the output
1855 string. (we cannot just skip the
1856 code, since there's no way to know
1857 what's in the argument list) */
1858 n += strlen(p);
1859 goto expand;
1860 }
1861 } else
1862 n++;
1863 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001864 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001865 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001867 we don't have to resize the string.
1868 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001869 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001870 if (!string)
1871 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001872 kind = PyUnicode_KIND(string);
1873 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001874 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001877 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001878 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001879 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001880
1881 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001882 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1883 /* checking for == because the last argument could be a empty
1884 string, which causes i to point to end, the assert at the end of
1885 the loop */
1886 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001887
Benjamin Peterson14339b62009-01-31 16:36:08 +00001888 switch (*f) {
1889 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001890 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891 const int ordinal = va_arg(vargs, int);
1892 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001893 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001894 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001895 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001896 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001897 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001898 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001899 case 'p':
1900 /* unused, since we already have the result */
1901 if (*f == 'p')
1902 (void) va_arg(vargs, void *);
1903 else
1904 (void) va_arg(vargs, int);
1905 /* extract the result from numberresults and append. */
1906 for (; *numberresult; ++i, ++numberresult)
1907 PyUnicode_WRITE(kind, data, i, *numberresult);
1908 /* skip over the separating '\0' */
1909 assert(*numberresult == '\0');
1910 numberresult++;
1911 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001912 break;
1913 case 's':
1914 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001915 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001917 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 size = PyUnicode_GET_LENGTH(*callresult);
1919 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001920 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1921 *callresult, 0,
1922 size) < 0)
1923 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001924 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001925 /* We're done with the unicode()/repr() => forget it */
1926 Py_DECREF(*callresult);
1927 /* switch to next unicode()/repr() result */
1928 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001929 break;
1930 }
1931 case 'U':
1932 {
1933 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934 Py_ssize_t size;
1935 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1936 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001937 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1938 obj, 0,
1939 size) < 0)
1940 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001942 break;
1943 }
1944 case 'V':
1945 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001947 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001948 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001949 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 size = PyUnicode_GET_LENGTH(obj);
1951 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001952 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1953 obj, 0,
1954 size) < 0)
1955 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001956 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001957 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 size = PyUnicode_GET_LENGTH(*callresult);
1959 assert(PyUnicode_KIND(*callresult) <=
1960 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001961 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1962 *callresult,
1963 0, size) < 0)
1964 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001965 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001966 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001967 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001968 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001969 break;
1970 }
1971 case 'S':
1972 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001973 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001974 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001975 /* unused, since we already have the result */
1976 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001978 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1979 *callresult, 0,
1980 PyUnicode_GET_LENGTH(*callresult)) < 0)
1981 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001982 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001983 /* We're done with the unicode()/repr() => forget it */
1984 Py_DECREF(*callresult);
1985 /* switch to next unicode()/repr() result */
1986 ++callresult;
1987 break;
1988 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001989 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001991 break;
1992 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 for (; *p; ++p, ++i)
1994 PyUnicode_WRITE(kind, data, i, *p);
1995 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001996 goto end;
1997 }
Victor Stinner1205f272010-09-11 00:54:47 +00001998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 else {
2000 assert(i < PyUnicode_GET_LENGTH(string));
2001 PyUnicode_WRITE(kind, data, i++, *f);
2002 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002003 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002005
Benjamin Peterson29060642009-01-31 22:14:21 +00002006 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002007 if (callresults)
2008 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 if (numberresults)
2010 PyObject_Free(numberresults);
2011 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002012 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002013 if (callresults) {
2014 PyObject **callresult2 = callresults;
2015 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002016 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002017 ++callresult2;
2018 }
2019 PyObject_Free(callresults);
2020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 if (numberresults)
2022 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002023 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002024}
2025
Walter Dörwaldd2034312007-05-18 16:29:38 +00002026PyObject *
2027PyUnicode_FromFormat(const char *format, ...)
2028{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002029 PyObject* ret;
2030 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002031
2032#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002033 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002034#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002035 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002036#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002037 ret = PyUnicode_FromFormatV(format, vargs);
2038 va_end(vargs);
2039 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002040}
2041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042#ifdef HAVE_WCHAR_H
2043
Victor Stinner5593d8a2010-10-02 11:11:27 +00002044/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2045 convert a Unicode object to a wide character string.
2046
Victor Stinnerd88d9832011-09-06 02:00:05 +02002047 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002048 character) required to convert the unicode object. Ignore size argument.
2049
Victor Stinnerd88d9832011-09-06 02:00:05 +02002050 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002051 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002052 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002053static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002054unicode_aswidechar(PyUnicodeObject *unicode,
2055 wchar_t *w,
2056 Py_ssize_t size)
2057{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002058 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 const wchar_t *wstr;
2060
2061 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2062 if (wstr == NULL)
2063 return -1;
2064
Victor Stinner5593d8a2010-10-02 11:11:27 +00002065 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002066 if (size > res)
2067 size = res + 1;
2068 else
2069 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002070 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002071 return res;
2072 }
2073 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002075}
2076
2077Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002078PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002079 wchar_t *w,
2080 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081{
2082 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002083 PyErr_BadInternalCall();
2084 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002086 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087}
2088
Victor Stinner137c34c2010-09-29 10:25:54 +00002089wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002090PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002091 Py_ssize_t *size)
2092{
2093 wchar_t* buffer;
2094 Py_ssize_t buflen;
2095
2096 if (unicode == NULL) {
2097 PyErr_BadInternalCall();
2098 return NULL;
2099 }
2100
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002101 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102 if (buflen == -1)
2103 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002104 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002105 PyErr_NoMemory();
2106 return NULL;
2107 }
2108
Victor Stinner137c34c2010-09-29 10:25:54 +00002109 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2110 if (buffer == NULL) {
2111 PyErr_NoMemory();
2112 return NULL;
2113 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002114 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002115 if (buflen == -1)
2116 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002117 if (size != NULL)
2118 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002119 return buffer;
2120}
2121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123
Alexander Belopolsky40018472011-02-26 01:02:56 +00002124PyObject *
2125PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002126{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002127 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002128 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002129 PyErr_SetString(PyExc_ValueError,
2130 "chr() arg not in range(0x110000)");
2131 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002132 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134 if (ordinal < 256)
2135 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137 v = PyUnicode_New(1, ordinal);
2138 if (v == NULL)
2139 return NULL;
2140 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2141 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002142}
2143
Alexander Belopolsky40018472011-02-26 01:02:56 +00002144PyObject *
2145PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002147 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002148 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002149 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002150 if (PyUnicode_READY(obj))
2151 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002152 Py_INCREF(obj);
2153 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002154 }
2155 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002156 /* For a Unicode subtype that's not a Unicode object,
2157 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002158 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002159 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002160 PyErr_Format(PyExc_TypeError,
2161 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002162 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002163 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002164}
2165
Alexander Belopolsky40018472011-02-26 01:02:56 +00002166PyObject *
2167PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002168 const char *encoding,
2169 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002170{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002171 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002172 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002173
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002175 PyErr_BadInternalCall();
2176 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002178
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002179 /* Decoding bytes objects is the most common case and should be fast */
2180 if (PyBytes_Check(obj)) {
2181 if (PyBytes_GET_SIZE(obj) == 0) {
2182 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002183 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002184 }
2185 else {
2186 v = PyUnicode_Decode(
2187 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2188 encoding, errors);
2189 }
2190 return v;
2191 }
2192
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002193 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002194 PyErr_SetString(PyExc_TypeError,
2195 "decoding str is not supported");
2196 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002197 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002198
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002199 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2200 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2201 PyErr_Format(PyExc_TypeError,
2202 "coercing to str: need bytes, bytearray "
2203 "or buffer-like object, %.80s found",
2204 Py_TYPE(obj)->tp_name);
2205 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002206 }
Tim Petersced69f82003-09-16 20:30:58 +00002207
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002208 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002209 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002210 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 }
Tim Petersced69f82003-09-16 20:30:58 +00002212 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002213 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002214
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002215 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002216 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217}
2218
/* Copy `encoding` into `lower`, converting upper-case letters to lower case
   and replacing '_' with '-', so that spellings such as "UTF_8" match the
   shortcut names.

   `lower_len` is the capacity of `lower`, including room for the
   terminating NUL.  Returns 1 on success.  Returns 0 if the encoding name
   needs more than lower_len-1 characters; in that case `lower` has been
   partially written and is NOT NUL-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last slot of the output buffer: reserved for the terminator. */
    char *limit = &lower[lower_len - 1];

    for (; *src != '\0'; src++, dst++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2251
Alexander Belopolsky40018472011-02-26 01:02:56 +00002252PyObject *
2253PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002254 Py_ssize_t size,
2255 const char *encoding,
2256 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002257{
2258 PyObject *buffer = NULL, *unicode;
2259 Py_buffer info;
2260 char lower[11]; /* Enough for any encoding shortcut */
2261
2262 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002263 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002264
2265 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002266 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002267 if ((strcmp(lower, "utf-8") == 0) ||
2268 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002269 return PyUnicode_DecodeUTF8(s, size, errors);
2270 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002271 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002272 (strcmp(lower, "iso-8859-1") == 0))
2273 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002274#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002275 else if (strcmp(lower, "mbcs") == 0)
2276 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002277#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002278 else if (strcmp(lower, "ascii") == 0)
2279 return PyUnicode_DecodeASCII(s, size, errors);
2280 else if (strcmp(lower, "utf-16") == 0)
2281 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2282 else if (strcmp(lower, "utf-32") == 0)
2283 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285
2286 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002287 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002288 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002289 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002290 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 if (buffer == NULL)
2292 goto onError;
2293 unicode = PyCodec_Decode(buffer, encoding, errors);
2294 if (unicode == NULL)
2295 goto onError;
2296 if (!PyUnicode_Check(unicode)) {
2297 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002298 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002299 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 Py_DECREF(unicode);
2301 goto onError;
2302 }
2303 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002304 if (PyUnicode_READY(unicode)) {
2305 Py_DECREF(unicode);
2306 return NULL;
2307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002309
Benjamin Peterson29060642009-01-31 22:14:21 +00002310 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002311 Py_XDECREF(buffer);
2312 return NULL;
2313}
2314
Alexander Belopolsky40018472011-02-26 01:02:56 +00002315PyObject *
2316PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002317 const char *encoding,
2318 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002319{
2320 PyObject *v;
2321
2322 if (!PyUnicode_Check(unicode)) {
2323 PyErr_BadArgument();
2324 goto onError;
2325 }
2326
2327 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002328 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002329
2330 /* Decode via the codec registry */
2331 v = PyCodec_Decode(unicode, encoding, errors);
2332 if (v == NULL)
2333 goto onError;
2334 return v;
2335
Benjamin Peterson29060642009-01-31 22:14:21 +00002336 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002337 return NULL;
2338}
2339
Alexander Belopolsky40018472011-02-26 01:02:56 +00002340PyObject *
2341PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002342 const char *encoding,
2343 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002344{
2345 PyObject *v;
2346
2347 if (!PyUnicode_Check(unicode)) {
2348 PyErr_BadArgument();
2349 goto onError;
2350 }
2351
2352 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002353 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002354
2355 /* Decode via the codec registry */
2356 v = PyCodec_Decode(unicode, encoding, errors);
2357 if (v == NULL)
2358 goto onError;
2359 if (!PyUnicode_Check(v)) {
2360 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002361 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002362 Py_TYPE(v)->tp_name);
2363 Py_DECREF(v);
2364 goto onError;
2365 }
2366 return v;
2367
Benjamin Peterson29060642009-01-31 22:14:21 +00002368 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002369 return NULL;
2370}
2371
Alexander Belopolsky40018472011-02-26 01:02:56 +00002372PyObject *
2373PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002374 Py_ssize_t size,
2375 const char *encoding,
2376 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377{
2378 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002379
Guido van Rossumd57fd912000-03-10 22:53:23 +00002380 unicode = PyUnicode_FromUnicode(s, size);
2381 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002382 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002383 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2384 Py_DECREF(unicode);
2385 return v;
2386}
2387
Alexander Belopolsky40018472011-02-26 01:02:56 +00002388PyObject *
2389PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002390 const char *encoding,
2391 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002392{
2393 PyObject *v;
2394
2395 if (!PyUnicode_Check(unicode)) {
2396 PyErr_BadArgument();
2397 goto onError;
2398 }
2399
2400 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002401 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002402
2403 /* Encode via the codec registry */
2404 v = PyCodec_Encode(unicode, encoding, errors);
2405 if (v == NULL)
2406 goto onError;
2407 return v;
2408
Benjamin Peterson29060642009-01-31 22:14:21 +00002409 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002410 return NULL;
2411}
2412
Victor Stinnerad158722010-10-27 00:25:46 +00002413PyObject *
2414PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002415{
Victor Stinner99b95382011-07-04 14:23:54 +02002416#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002417 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2418 PyUnicode_GET_SIZE(unicode),
2419 NULL);
2420#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002422#else
Victor Stinner793b5312011-04-27 00:24:21 +02002423 PyInterpreterState *interp = PyThreadState_GET()->interp;
2424 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2425 cannot use it to encode and decode filenames before it is loaded. Load
2426 the Python codec requires to encode at least its own filename. Use the C
2427 version of the locale codec until the codec registry is initialized and
2428 the Python codec is loaded.
2429
2430 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2431 cannot only rely on it: check also interp->fscodec_initialized for
2432 subinterpreters. */
2433 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002434 return PyUnicode_AsEncodedString(unicode,
2435 Py_FileSystemDefaultEncoding,
2436 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002437 }
2438 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002439 /* locale encoding with surrogateescape */
2440 wchar_t *wchar;
2441 char *bytes;
2442 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002443 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002444
2445 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2446 if (wchar == NULL)
2447 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002448 bytes = _Py_wchar2char(wchar, &error_pos);
2449 if (bytes == NULL) {
2450 if (error_pos != (size_t)-1) {
2451 char *errmsg = strerror(errno);
2452 PyObject *exc = NULL;
2453 if (errmsg == NULL)
2454 errmsg = "Py_wchar2char() failed";
2455 raise_encode_exception(&exc,
2456 "filesystemencoding",
2457 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2458 error_pos, error_pos+1,
2459 errmsg);
2460 Py_XDECREF(exc);
2461 }
2462 else
2463 PyErr_NoMemory();
2464 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002465 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002466 }
2467 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002468
2469 bytes_obj = PyBytes_FromString(bytes);
2470 PyMem_Free(bytes);
2471 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002472 }
Victor Stinnerad158722010-10-27 00:25:46 +00002473#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002474}
2475
Alexander Belopolsky40018472011-02-26 01:02:56 +00002476PyObject *
2477PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002478 const char *encoding,
2479 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480{
2481 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002482 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002483
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 if (!PyUnicode_Check(unicode)) {
2485 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 }
Fred Drakee4315f52000-05-09 19:53:39 +00002488
Victor Stinner2f283c22011-03-02 01:21:46 +00002489 if (encoding == NULL) {
2490 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002492 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002494 }
Fred Drakee4315f52000-05-09 19:53:39 +00002495
2496 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002497 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002498 if ((strcmp(lower, "utf-8") == 0) ||
2499 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002500 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002501 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002503 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002505 }
Victor Stinner37296e82010-06-10 13:36:23 +00002506 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002507 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002508 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002509 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002510#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002511 else if (strcmp(lower, "mbcs") == 0)
2512 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2513 PyUnicode_GET_SIZE(unicode),
2514 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002515#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002516 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519
2520 /* Encode via the codec registry */
2521 v = PyCodec_Encode(unicode, encoding, errors);
2522 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002523 return NULL;
2524
2525 /* The normal path */
2526 if (PyBytes_Check(v))
2527 return v;
2528
2529 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002530 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002531 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002532 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002533
2534 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2535 "encoder %s returned bytearray instead of bytes",
2536 encoding);
2537 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002538 Py_DECREF(v);
2539 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002540 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002541
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002542 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2543 Py_DECREF(v);
2544 return b;
2545 }
2546
2547 PyErr_Format(PyExc_TypeError,
2548 "encoder did not return a bytes object (type=%.400s)",
2549 Py_TYPE(v)->tp_name);
2550 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002551 return NULL;
2552}
2553
Alexander Belopolsky40018472011-02-26 01:02:56 +00002554PyObject *
2555PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002556 const char *encoding,
2557 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002558{
2559 PyObject *v;
2560
2561 if (!PyUnicode_Check(unicode)) {
2562 PyErr_BadArgument();
2563 goto onError;
2564 }
2565
2566 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002567 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002568
2569 /* Encode via the codec registry */
2570 v = PyCodec_Encode(unicode, encoding, errors);
2571 if (v == NULL)
2572 goto onError;
2573 if (!PyUnicode_Check(v)) {
2574 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002575 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002576 Py_TYPE(v)->tp_name);
2577 Py_DECREF(v);
2578 goto onError;
2579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002581
Benjamin Peterson29060642009-01-31 22:14:21 +00002582 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583 return NULL;
2584}
2585
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002586PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002587PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002588 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002589 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2590}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002591
Christian Heimes5894ba72007-11-04 11:43:14 +00002592PyObject*
2593PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2594{
Victor Stinner99b95382011-07-04 14:23:54 +02002595#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002596 return PyUnicode_DecodeMBCS(s, size, NULL);
2597#elif defined(__APPLE__)
2598 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2599#else
Victor Stinner793b5312011-04-27 00:24:21 +02002600 PyInterpreterState *interp = PyThreadState_GET()->interp;
2601 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2602 cannot use it to encode and decode filenames before it is loaded. Load
2603 the Python codec requires to encode at least its own filename. Use the C
2604 version of the locale codec until the codec registry is initialized and
2605 the Python codec is loaded.
2606
2607 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2608 cannot only rely on it: check also interp->fscodec_initialized for
2609 subinterpreters. */
2610 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002611 return PyUnicode_Decode(s, size,
2612 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002613 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002614 }
2615 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002616 /* locale encoding with surrogateescape */
2617 wchar_t *wchar;
2618 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002619 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002620
2621 if (s[size] != '\0' || size != strlen(s)) {
2622 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2623 return NULL;
2624 }
2625
Victor Stinner168e1172010-10-16 23:16:16 +00002626 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002627 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002628 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002629
Victor Stinner168e1172010-10-16 23:16:16 +00002630 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002631 PyMem_Free(wchar);
2632 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002633 }
Victor Stinnerad158722010-10-27 00:25:46 +00002634#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002635}
2636
Martin v. Löwis011e8422009-05-05 04:43:17 +00002637
2638int
2639PyUnicode_FSConverter(PyObject* arg, void* addr)
2640{
2641 PyObject *output = NULL;
2642 Py_ssize_t size;
2643 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002644 if (arg == NULL) {
2645 Py_DECREF(*(PyObject**)addr);
2646 return 1;
2647 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002648 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002649 output = arg;
2650 Py_INCREF(output);
2651 }
2652 else {
2653 arg = PyUnicode_FromObject(arg);
2654 if (!arg)
2655 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002656 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002657 Py_DECREF(arg);
2658 if (!output)
2659 return 0;
2660 if (!PyBytes_Check(output)) {
2661 Py_DECREF(output);
2662 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2663 return 0;
2664 }
2665 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002666 size = PyBytes_GET_SIZE(output);
2667 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002668 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002669 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002670 Py_DECREF(output);
2671 return 0;
2672 }
2673 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002674 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002675}
2676
2677
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002678int
2679PyUnicode_FSDecoder(PyObject* arg, void* addr)
2680{
2681 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002682 if (arg == NULL) {
2683 Py_DECREF(*(PyObject**)addr);
2684 return 1;
2685 }
2686 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002687 if (PyUnicode_READY(arg))
2688 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002689 output = arg;
2690 Py_INCREF(output);
2691 }
2692 else {
2693 arg = PyBytes_FromObject(arg);
2694 if (!arg)
2695 return 0;
2696 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2697 PyBytes_GET_SIZE(arg));
2698 Py_DECREF(arg);
2699 if (!output)
2700 return 0;
2701 if (!PyUnicode_Check(output)) {
2702 Py_DECREF(output);
2703 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2704 return 0;
2705 }
2706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2708 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002709 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2710 Py_DECREF(output);
2711 return 0;
2712 }
2713 *(PyObject**)addr = output;
2714 return Py_CLEANUP_SUPPORTED;
2715}
2716
2717
Martin v. Löwis5b222132007-06-10 09:51:05 +00002718char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002719PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002720{
Christian Heimesf3863112007-11-22 07:46:41 +00002721 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002722 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2723
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002724 if (!PyUnicode_Check(unicode)) {
2725 PyErr_BadArgument();
2726 return NULL;
2727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002728 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002729 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002730
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002731 if (PyUnicode_UTF8(unicode) == NULL) {
2732 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002733 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2734 if (bytes == NULL)
2735 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002736 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2737 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 Py_DECREF(bytes);
2739 return NULL;
2740 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002741 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2742 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 Py_DECREF(bytes);
2744 }
2745
2746 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002747 *psize = PyUnicode_UTF8_LENGTH(unicode);
2748 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002749}
2750
2751char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002752PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002753{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002754 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2755}
2756
2757#ifdef Py_DEBUG
2758int unicode_as_unicode_calls = 0;
2759#endif
2760
2761
2762Py_UNICODE *
2763PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2764{
2765 PyUnicodeObject *u;
2766 const unsigned char *one_byte;
2767#if SIZEOF_WCHAR_T == 4
2768 const Py_UCS2 *two_bytes;
2769#else
2770 const Py_UCS4 *four_bytes;
2771 const Py_UCS4 *ucs4_end;
2772 Py_ssize_t num_surrogates;
2773#endif
2774 wchar_t *w;
2775 wchar_t *wchar_end;
2776
2777 if (!PyUnicode_Check(unicode)) {
2778 PyErr_BadArgument();
2779 return NULL;
2780 }
2781 u = (PyUnicodeObject*)unicode;
2782 if (_PyUnicode_WSTR(u) == NULL) {
2783 /* Non-ASCII compact unicode object */
2784 assert(_PyUnicode_KIND(u) != 0);
2785 assert(PyUnicode_IS_READY(u));
2786
2787#ifdef Py_DEBUG
2788 ++unicode_as_unicode_calls;
2789#endif
2790
2791 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2792#if SIZEOF_WCHAR_T == 2
2793 four_bytes = PyUnicode_4BYTE_DATA(u);
2794 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2795 num_surrogates = 0;
2796
2797 for (; four_bytes < ucs4_end; ++four_bytes) {
2798 if (*four_bytes > 0xFFFF)
2799 ++num_surrogates;
2800 }
2801
2802 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2803 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2804 if (!_PyUnicode_WSTR(u)) {
2805 PyErr_NoMemory();
2806 return NULL;
2807 }
2808 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2809
2810 w = _PyUnicode_WSTR(u);
2811 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2812 four_bytes = PyUnicode_4BYTE_DATA(u);
2813 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2814 if (*four_bytes > 0xFFFF) {
2815 /* encode surrogate pair in this case */
2816 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2817 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2818 }
2819 else
2820 *w = *four_bytes;
2821
2822 if (w > wchar_end) {
2823 assert(0 && "Miscalculated string end");
2824 }
2825 }
2826 *w = 0;
2827#else
2828 /* sizeof(wchar_t) == 4 */
2829 Py_FatalError("Impossible unicode object state, wstr and str "
2830 "should share memory already.");
2831 return NULL;
2832#endif
2833 }
2834 else {
2835 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2836 (_PyUnicode_LENGTH(u) + 1));
2837 if (!_PyUnicode_WSTR(u)) {
2838 PyErr_NoMemory();
2839 return NULL;
2840 }
2841 if (!PyUnicode_IS_COMPACT_ASCII(u))
2842 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2843 w = _PyUnicode_WSTR(u);
2844 wchar_end = w + _PyUnicode_LENGTH(u);
2845
2846 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2847 one_byte = PyUnicode_1BYTE_DATA(u);
2848 for (; w < wchar_end; ++one_byte, ++w)
2849 *w = *one_byte;
2850 /* null-terminate the wstr */
2851 *w = 0;
2852 }
2853 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2854#if SIZEOF_WCHAR_T == 4
2855 two_bytes = PyUnicode_2BYTE_DATA(u);
2856 for (; w < wchar_end; ++two_bytes, ++w)
2857 *w = *two_bytes;
2858 /* null-terminate the wstr */
2859 *w = 0;
2860#else
2861 /* sizeof(wchar_t) == 2 */
2862 PyObject_FREE(_PyUnicode_WSTR(u));
2863 _PyUnicode_WSTR(u) = NULL;
2864 Py_FatalError("Impossible unicode object state, wstr "
2865 "and str should share memory already.");
2866 return NULL;
2867#endif
2868 }
2869 else {
2870 assert(0 && "This should never happen.");
2871 }
2872 }
2873 }
2874 if (size != NULL)
2875 *size = PyUnicode_WSTR_LENGTH(u);
2876 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002877}
2878
Alexander Belopolsky40018472011-02-26 01:02:56 +00002879Py_UNICODE *
2880PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002882 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883}
2884
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002885
Alexander Belopolsky40018472011-02-26 01:02:56 +00002886Py_ssize_t
2887PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888{
2889 if (!PyUnicode_Check(unicode)) {
2890 PyErr_BadArgument();
2891 goto onError;
2892 }
2893 return PyUnicode_GET_SIZE(unicode);
2894
Benjamin Peterson29060642009-01-31 22:14:21 +00002895 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896 return -1;
2897}
2898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002899Py_ssize_t
2900PyUnicode_GetLength(PyObject *unicode)
2901{
Victor Stinner5a706cf2011-10-02 00:36:53 +02002902 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002903 PyErr_BadArgument();
2904 return -1;
2905 }
2906
2907 return PyUnicode_GET_LENGTH(unicode);
2908}
2909
2910Py_UCS4
2911PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2912{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02002913 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
2914 PyErr_BadArgument();
2915 return (Py_UCS4)-1;
2916 }
2917 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
2918 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002919 return (Py_UCS4)-1;
2920 }
2921 return PyUnicode_READ_CHAR(unicode, index);
2922}
2923
2924int
2925PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2926{
2927 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02002928 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002929 return -1;
2930 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02002931 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
2932 PyErr_SetString(PyExc_IndexError, "string index out of range");
2933 return -1;
2934 }
2935 if (_PyUnicode_Dirty(unicode))
2936 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002937 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2938 index, ch);
2939 return 0;
2940}
2941
/* Return the name of the default encoding.  The value is a fixed,
   statically allocated string ("utf-8"); callers must not modify or free
   it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2947
Victor Stinner554f3f02010-06-16 23:33:54 +00002948/* create or adjust a UnicodeDecodeError */
2949static void
2950make_decode_exception(PyObject **exceptionObject,
2951 const char *encoding,
2952 const char *input, Py_ssize_t length,
2953 Py_ssize_t startpos, Py_ssize_t endpos,
2954 const char *reason)
2955{
2956 if (*exceptionObject == NULL) {
2957 *exceptionObject = PyUnicodeDecodeError_Create(
2958 encoding, input, length, startpos, endpos, reason);
2959 }
2960 else {
2961 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2962 goto onError;
2963 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2964 goto onError;
2965 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2966 goto onError;
2967 }
2968 return;
2969
2970onError:
2971 Py_DECREF(*exceptionObject);
2972 *exceptionObject = NULL;
2973}
2974
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002975/* error handling callback helper:
2976 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002977 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002978 and adjust various state variables.
2979 return 0 on success, -1 on error
2980*/
2981
Alexander Belopolsky40018472011-02-26 01:02:56 +00002982static int
2983unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002984 const char *encoding, const char *reason,
2985 const char **input, const char **inend, Py_ssize_t *startinpos,
2986 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2987 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002988{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002989 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002990
2991 PyObject *restuple = NULL;
2992 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002993 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002994 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002995 Py_ssize_t requiredsize;
2996 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002997 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002998 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002999 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000 int res = -1;
3001
3002 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 *errorHandler = PyCodec_LookupError(errors);
3004 if (*errorHandler == NULL)
3005 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003006 }
3007
Victor Stinner554f3f02010-06-16 23:33:54 +00003008 make_decode_exception(exceptionObject,
3009 encoding,
3010 *input, *inend - *input,
3011 *startinpos, *endinpos,
3012 reason);
3013 if (*exceptionObject == NULL)
3014 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003015
3016 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3017 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003018 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003019 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003020 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003021 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022 }
3023 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003024 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003025
3026 /* Copy back the bytes variables, which might have been modified by the
3027 callback */
3028 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3029 if (!inputobj)
3030 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003031 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003033 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003034 *input = PyBytes_AS_STRING(inputobj);
3035 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003036 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003037 /* we can DECREF safely, as the exception has another reference,
3038 so the object won't go away. */
3039 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003040
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003043 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003044 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3045 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003046 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003047
3048 /* need more space? (at least enough for what we
3049 have+the replacement+the rest of the string (starting
3050 at the new input position), so we won't have to check space
3051 when there are no errors in the rest of the string) */
3052 repptr = PyUnicode_AS_UNICODE(repunicode);
3053 repsize = PyUnicode_GET_SIZE(repunicode);
3054 requiredsize = *outpos + repsize + insize-newpos;
3055 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 if (requiredsize<2*outsize)
3057 requiredsize = 2*outsize;
3058 if (_PyUnicode_Resize(output, requiredsize) < 0)
3059 goto onError;
3060 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003061 }
3062 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003063 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 Py_UNICODE_COPY(*outptr, repptr, repsize);
3065 *outptr += repsize;
3066 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 /* we made it! */
3069 res = 0;
3070
Benjamin Peterson29060642009-01-31 22:14:21 +00003071 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003072 Py_XDECREF(restuple);
3073 return res;
3074}
3075
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003076/* --- UTF-7 Codec -------------------------------------------------------- */
3077
Antoine Pitrou244651a2009-05-04 18:56:13 +00003078/* See RFC2152 for details. We encode conservatively and decode liberally. */
3079
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  Each macro may evaluate its argument more than once, so
   arguments must be free of side effects. */

/* True if c is one of the 64 base-64 characters
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                                    \
    (((c) >= 'A' && (c) <= 'Z') ||                      \
     ((c) >= 'a' && (c) <= 'z') ||                      \
     ((c) >= '0' && (c) <= '9') ||                      \
     (c) == '+' ||                                      \
     (c) == '/')

/* Map a base-64 character to its 6-bit value:
   A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, anything else
   (i.e. '/', given that c is a base-64 character) -> 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) == '+') ? 62 : 63)

/* Base-64 character for the bottom 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n)                                                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if a byte found in UTF-7 input decodes as itself.
 * Decoding is permissive: every ASCII byte is taken literally except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) < 128 && (c) != '+')
3110
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  Each entry is one of:
 *
 *   0  "Set D":      alphanumerics and ' ( ) , - . / : ?
 *   1  "Set O":      ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
 *   2  whitespace:   ht nl cr sp
 *   3  special:      everything else, i.e. + \ ~ and the non-printing
 *                    codes 0-8, 11-12, 14-31 and 127; these must always
 *                    be base-64 encoded
 */

static char utf7_category[128] = {
    /* 0x00-0x0F: nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10-0x1F: dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20-0x2F: sp ! " # $ % & ' ( ) * + , - . / */
    2, 1, 1, 1, 1, 1, 1, 0,  0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30-0x3F: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40-0x4F: @ A B C D E F G H I J K L M N O */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: P Q R S T U V W X Y Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60-0x6F: ` a b c d e f g h i j k l m n o */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: p q r s t u v w x y z { | } ~ del */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 1, 1, 3, 3,
};
3144
/* ENCODE_DIRECT: true if character c should be encoded as itself.
 *
 * c must be a non-NUL ASCII character, and additionally either
 *   - belong to Set D (category 0), or
 *   - be whitespace (category 2) while directWS is true, or
 *   - belong to Set O (category 1) while directO is true.
 *
 * RFC2152 makes it clear that the handling of Set O and of whitespace
 * varies between applications, hence the two flags.
 *
 * All parameters are parenthesized in the expansion so that arbitrary
 * expressions can be passed safely; c is evaluated several times and
 * must therefore be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003156
Alexander Belopolsky40018472011-02-26 01:02:56 +00003157PyObject *
3158PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003159 Py_ssize_t size,
3160 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003161{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003162 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3163}
3164
Antoine Pitrou244651a2009-05-04 18:56:13 +00003165/* The decoder. The only state we preserve is our read position,
3166 * i.e. how many characters we have consumed. So if we end in the
3167 * middle of a shift sequence we have to back off the read position
3168 * and the output to the beginning of the sequence, otherwise we lose
3169 * all the shift state (seen bits, number of bits seen, high
3170 * surrogate). */
3171
Alexander Belopolsky40018472011-02-26 01:02:56 +00003172PyObject *
3173PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003174 Py_ssize_t size,
3175 const char *errors,
3176 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003177{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003178 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003179 Py_ssize_t startinpos;
3180 Py_ssize_t endinpos;
3181 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003182 const char *e;
3183 PyUnicodeObject *unicode;
3184 Py_UNICODE *p;
3185 const char *errmsg = "";
3186 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003187 Py_UNICODE *shiftOutStart;
3188 unsigned int base64bits = 0;
3189 unsigned long base64buffer = 0;
3190 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003191 PyObject *errorHandler = NULL;
3192 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003193
3194 unicode = _PyUnicode_New(size);
3195 if (!unicode)
3196 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003197 if (size == 0) {
3198 if (consumed)
3199 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003200 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003201 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003203 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003204 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003205 e = s + size;
3206
3207 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003209 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003210 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003211
Antoine Pitrou244651a2009-05-04 18:56:13 +00003212 if (inShift) { /* in a base-64 section */
3213 if (IS_BASE64(ch)) { /* consume a base-64 character */
3214 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3215 base64bits += 6;
3216 s++;
3217 if (base64bits >= 16) {
3218 /* we have enough bits for a UTF-16 value */
3219 Py_UNICODE outCh = (Py_UNICODE)
3220 (base64buffer >> (base64bits-16));
3221 base64bits -= 16;
3222 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3223 if (surrogate) {
3224 /* expecting a second surrogate */
3225 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3226#ifdef Py_UNICODE_WIDE
3227 *p++ = (((surrogate & 0x3FF)<<10)
3228 | (outCh & 0x3FF)) + 0x10000;
3229#else
3230 *p++ = surrogate;
3231 *p++ = outCh;
3232#endif
3233 surrogate = 0;
3234 }
3235 else {
3236 surrogate = 0;
3237 errmsg = "second surrogate missing";
3238 goto utf7Error;
3239 }
3240 }
3241 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3242 /* first surrogate */
3243 surrogate = outCh;
3244 }
3245 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3246 errmsg = "unexpected second surrogate";
3247 goto utf7Error;
3248 }
3249 else {
3250 *p++ = outCh;
3251 }
3252 }
3253 }
3254 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003255 inShift = 0;
3256 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003257 if (surrogate) {
3258 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003259 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003260 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003261 if (base64bits > 0) { /* left-over bits */
3262 if (base64bits >= 6) {
3263 /* We've seen at least one base-64 character */
3264 errmsg = "partial character in shift sequence";
3265 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003266 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003267 else {
3268 /* Some bits remain; they should be zero */
3269 if (base64buffer != 0) {
3270 errmsg = "non-zero padding bits in shift sequence";
3271 goto utf7Error;
3272 }
3273 }
3274 }
3275 if (ch != '-') {
3276 /* '-' is absorbed; other terminating
3277 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003278 *p++ = ch;
3279 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003280 }
3281 }
3282 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003284 s++; /* consume '+' */
3285 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003286 s++;
3287 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003288 }
3289 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003290 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003291 shiftOutStart = p;
3292 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003293 }
3294 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003295 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003296 *p++ = ch;
3297 s++;
3298 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003299 else {
3300 startinpos = s-starts;
3301 s++;
3302 errmsg = "unexpected special character";
3303 goto utf7Error;
3304 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003305 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003306utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003307 outpos = p-PyUnicode_AS_UNICODE(unicode);
3308 endinpos = s-starts;
3309 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 errors, &errorHandler,
3311 "utf7", errmsg,
3312 &starts, &e, &startinpos, &endinpos, &exc, &s,
3313 &unicode, &outpos, &p))
3314 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003315 }
3316
Antoine Pitrou244651a2009-05-04 18:56:13 +00003317 /* end of string */
3318
3319 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3320 /* if we're in an inconsistent state, that's an error */
3321 if (surrogate ||
3322 (base64bits >= 6) ||
3323 (base64bits > 0 && base64buffer != 0)) {
3324 outpos = p-PyUnicode_AS_UNICODE(unicode);
3325 endinpos = size;
3326 if (unicode_decode_call_errorhandler(
3327 errors, &errorHandler,
3328 "utf7", "unterminated shift sequence",
3329 &starts, &e, &startinpos, &endinpos, &exc, &s,
3330 &unicode, &outpos, &p))
3331 goto onError;
3332 if (s < e)
3333 goto restart;
3334 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003335 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003336
3337 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003338 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003339 if (inShift) {
3340 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003341 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003342 }
3343 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003344 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003345 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003346 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003347
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003348 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003349 goto onError;
3350
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003351 Py_XDECREF(errorHandler);
3352 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003353 if (PyUnicode_READY(unicode) == -1) {
3354 Py_DECREF(unicode);
3355 return NULL;
3356 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003357 return (PyObject *)unicode;
3358
Benjamin Peterson29060642009-01-31 22:14:21 +00003359 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003360 Py_XDECREF(errorHandler);
3361 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003362 Py_DECREF(unicode);
3363 return NULL;
3364}
3365
3366
Alexander Belopolsky40018472011-02-26 01:02:56 +00003367PyObject *
3368PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003369 Py_ssize_t size,
3370 int base64SetO,
3371 int base64WhiteSpace,
3372 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003373{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003374 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003375 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003376 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003377 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003378 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003379 unsigned int base64bits = 0;
3380 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003381 char * out;
3382 char * start;
3383
3384 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003385 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003386
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003387 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003388 return PyErr_NoMemory();
3389
Antoine Pitrou244651a2009-05-04 18:56:13 +00003390 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003391 if (v == NULL)
3392 return NULL;
3393
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003394 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003395 for (;i < size; ++i) {
3396 Py_UNICODE ch = s[i];
3397
Antoine Pitrou244651a2009-05-04 18:56:13 +00003398 if (inShift) {
3399 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3400 /* shifting out */
3401 if (base64bits) { /* output remaining bits */
3402 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3403 base64buffer = 0;
3404 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003405 }
3406 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003407 /* Characters not in the BASE64 set implicitly unshift the sequence
3408 so no '-' is required, except if the character is itself a '-' */
3409 if (IS_BASE64(ch) || ch == '-') {
3410 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003411 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003412 *out++ = (char) ch;
3413 }
3414 else {
3415 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003416 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003417 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003418 else { /* not in a shift sequence */
3419 if (ch == '+') {
3420 *out++ = '+';
3421 *out++ = '-';
3422 }
3423 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3424 *out++ = (char) ch;
3425 }
3426 else {
3427 *out++ = '+';
3428 inShift = 1;
3429 goto encode_char;
3430 }
3431 }
3432 continue;
3433encode_char:
3434#ifdef Py_UNICODE_WIDE
3435 if (ch >= 0x10000) {
3436 /* code first surrogate */
3437 base64bits += 16;
3438 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3439 while (base64bits >= 6) {
3440 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3441 base64bits -= 6;
3442 }
3443 /* prepare second surrogate */
3444 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3445 }
3446#endif
3447 base64bits += 16;
3448 base64buffer = (base64buffer << 16) | ch;
3449 while (base64bits >= 6) {
3450 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3451 base64bits -= 6;
3452 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003453 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003454 if (base64bits)
3455 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3456 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003457 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003458 if (_PyBytes_Resize(&v, out - start) < 0)
3459 return NULL;
3460 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003461}
3462
Antoine Pitrou244651a2009-05-04 18:56:13 +00003463#undef IS_BASE64
3464#undef FROM_BASE64
3465#undef TO_BASE64
3466#undef DECODE_DIRECT
3467#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003468
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469/* --- UTF-8 Codec -------------------------------------------------------- */
3470
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it starts.  Zero marks a byte that is illegal as a prefix.
   See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 0x00-0x7F: ASCII, one byte each */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 70-7F */
    /* 0x80-0xBF: continuation bytes, never a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* B0-BF */
    /* 0xC0-0xC1 are illegal; 0xC2-0xDF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   /* D0-DF */
    /* 0xE0-0xEF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,   /* E0-EF */
    /* 0xF0-0xF4 start four-byte sequences; 0xF5-0xFF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0    /* F0-FF */
};
3492
Alexander Belopolsky40018472011-02-26 01:02:56 +00003493PyObject *
3494PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003495 Py_ssize_t size,
3496 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497{
Walter Dörwald69652032004-09-07 20:24:22 +00003498 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3499}
3500
Antoine Pitrouab868312009-01-10 15:40:25 +00003501/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3502#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3503
3504/* Mask to quickly check whether a C 'long' contains a
3505 non-ASCII, UTF8-encoded char. */
3506#if (SIZEOF_LONG == 8)
3507# define ASCII_CHAR_MASK 0x8080808080808080L
3508#elif (SIZEOF_LONG == 4)
3509# define ASCII_CHAR_MASK 0x80808080L
3510#else
3511# error C 'long' size should be either 4 or 8!
3512#endif
3513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003514/* Scans a UTF-8 string and returns the maximum character to be expected,
3515 the size of the decoded unicode string and if any major errors were
3516 encountered.
3517
3518 This function does check basic UTF-8 sanity, it does however NOT CHECK
3519 if the string contains surrogates, and if all continuation bytes are
3520 within the correct ranges, these checks are performed in
3521 PyUnicode_DecodeUTF8Stateful.
3522
3523 If it sets has_errors to 1, it means the value of unicode_size and max_char
3524 will be bogus and you should not rely on useful information in them.
3525 */
3526static Py_UCS4
3527utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3528 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3529 int *has_errors)
3530{
3531 Py_ssize_t n;
3532 Py_ssize_t char_count = 0;
3533 Py_UCS4 max_char = 127, new_max;
3534 Py_UCS4 upper_bound;
3535 const unsigned char *p = (const unsigned char *)s;
3536 const unsigned char *end = p + string_size;
3537 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3538 int err = 0;
3539
3540 for (; p < end && !err; ++p, ++char_count) {
3541 /* Only check value if it's not a ASCII char... */
3542 if (*p < 0x80) {
3543 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3544 an explanation. */
3545 if (!((size_t) p & LONG_PTR_MASK)) {
3546 /* Help register allocation */
3547 register const unsigned char *_p = p;
3548 while (_p < aligned_end) {
3549 unsigned long value = *(unsigned long *) _p;
3550 if (value & ASCII_CHAR_MASK)
3551 break;
3552 _p += SIZEOF_LONG;
3553 char_count += SIZEOF_LONG;
3554 }
3555 p = _p;
3556 if (p == end)
3557 break;
3558 }
3559 }
3560 if (*p >= 0x80) {
3561 n = utf8_code_length[*p];
3562 new_max = max_char;
3563 switch (n) {
3564 /* invalid start byte */
3565 case 0:
3566 err = 1;
3567 break;
3568 case 2:
3569 /* Code points between 0x00FF and 0x07FF inclusive.
3570 Approximate the upper bound of the code point,
3571 if this flips over 255 we can be sure it will be more
3572 than 255 and the string will need 2 bytes per code coint,
3573 if it stays under or equal to 255, we can be sure 1 byte
3574 is enough.
3575 ((*p & 0b00011111) << 6) | 0b00111111 */
3576 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3577 if (max_char < upper_bound)
3578 new_max = upper_bound;
3579 /* Ensure we track at least that we left ASCII space. */
3580 if (new_max < 128)
3581 new_max = 128;
3582 break;
3583 case 3:
3584 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3585 always > 255 and <= 65535 and will always need 2 bytes. */
3586 if (max_char < 65535)
3587 new_max = 65535;
3588 break;
3589 case 4:
3590 /* Code point will be above 0xFFFF for sure in this case. */
3591 new_max = 65537;
3592 break;
3593 /* Internal error, this should be caught by the first if */
3594 case 1:
3595 default:
3596 assert(0 && "Impossible case in utf8_max_char_and_size");
3597 err = 1;
3598 }
3599 /* Instead of number of overall bytes for this code point,
3600 n containts the number of following bytes: */
3601 --n;
3602 /* Check if the follow up chars are all valid continuation bytes */
3603 if (n >= 1) {
3604 const unsigned char *cont;
3605 if ((p + n) >= end) {
3606 if (consumed == 0)
3607 /* incomplete data, non-incremental decoding */
3608 err = 1;
3609 break;
3610 }
3611 for (cont = p + 1; cont < (p + n); ++cont) {
3612 if ((*cont & 0xc0) != 0x80) {
3613 err = 1;
3614 break;
3615 }
3616 }
3617 p += n;
3618 }
3619 else
3620 err = 1;
3621 max_char = new_max;
3622 }
3623 }
3624
3625 if (unicode_size)
3626 *unicode_size = char_count;
3627 if (has_errors)
3628 *has_errors = err;
3629 return max_char;
3630}
3631
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   Stores `value` at position `index` of `buf`, where `kind` selects the
   element type of the buffer: Py_UNICODE for PyUnicode_WCHAR_KIND,
   unsigned char for PyUnicode_1BYTE_KIND, Py_UCS2 for
   PyUnicode_2BYTE_KIND and Py_UCS4 for any other kind.  Each argument is
   evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int write_kind_ = (kind);                                     \
        switch (write_kind_) {                                              \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
3646
Alexander Belopolsky40018472011-02-26 01:02:56 +00003647PyObject *
3648PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003649 Py_ssize_t size,
3650 const char *errors,
3651 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003652{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003655 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003656 Py_ssize_t startinpos;
3657 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003658 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003660 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 PyObject *errorHandler = NULL;
3662 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003663 Py_UCS4 maxchar = 0;
3664 Py_ssize_t unicode_size;
3665 Py_ssize_t i;
3666 int kind;
3667 void *data;
3668 int has_errors;
3669 Py_UNICODE *error_outptr;
3670#if SIZEOF_WCHAR_T == 2
3671 Py_ssize_t wchar_offset = 0;
3672#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673
Walter Dörwald69652032004-09-07 20:24:22 +00003674 if (size == 0) {
3675 if (consumed)
3676 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003677 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003679 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3680 consumed, &has_errors);
3681 if (has_errors) {
3682 unicode = _PyUnicode_New(size);
3683 if (!unicode)
3684 return NULL;
3685 kind = PyUnicode_WCHAR_KIND;
3686 data = PyUnicode_AS_UNICODE(unicode);
3687 assert(data != NULL);
3688 }
3689 else {
3690 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3691 if (!unicode)
3692 return NULL;
3693 /* When the string is ASCII only, just use memcpy and return.
3694 unicode_size may be != size if there is an incomplete UTF-8
3695 sequence at the end of the ASCII block. */
3696 if (maxchar < 128 && size == unicode_size) {
3697 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3698 return (PyObject *)unicode;
3699 }
3700 kind = PyUnicode_KIND(unicode);
3701 data = PyUnicode_DATA(unicode);
3702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003704 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003706 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707
3708 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003709 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710
3711 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003712 /* Fast path for runs of ASCII characters. Given that common UTF-8
3713 input will consist of an overwhelming majority of ASCII
3714 characters, we try to optimize for this case by checking
3715 as many characters as a C 'long' can contain.
3716 First, check if we can do an aligned read, as most CPUs have
3717 a penalty for unaligned reads.
3718 */
3719 if (!((size_t) s & LONG_PTR_MASK)) {
3720 /* Help register allocation */
3721 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003722 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003723 while (_s < aligned_end) {
3724 /* Read a whole long at a time (either 4 or 8 bytes),
3725 and do a fast unrolled copy if it only contains ASCII
3726 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003727 unsigned long value = *(unsigned long *) _s;
3728 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003729 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003730 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3731 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3732 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3733 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003734#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003735 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3736 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3737 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3738 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003739#endif
3740 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003741 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003742 }
3743 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003744 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003745 if (s == e)
3746 break;
3747 ch = (unsigned char)*s;
3748 }
3749 }
3750
3751 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 s++;
3754 continue;
3755 }
3756
3757 n = utf8_code_length[ch];
3758
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003759 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003760 if (consumed)
3761 break;
3762 else {
3763 errmsg = "unexpected end of data";
3764 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003765 endinpos = startinpos+1;
3766 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3767 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003768 goto utf8Error;
3769 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771
3772 switch (n) {
3773
3774 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003775 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003776 startinpos = s-starts;
3777 endinpos = startinpos+1;
3778 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779
3780 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003781 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003782 startinpos = s-starts;
3783 endinpos = startinpos+1;
3784 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785
3786 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003787 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003788 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003789 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003790 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003791 goto utf8Error;
3792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003794 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003795 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 break;
3797
3798 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003799 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3800 will result in surrogates in range d800-dfff. Surrogates are
3801 not valid UTF-8 so they are rejected.
3802 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3803 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003804 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003805 (s[2] & 0xc0) != 0x80 ||
3806 ((unsigned char)s[0] == 0xE0 &&
3807 (unsigned char)s[1] < 0xA0) ||
3808 ((unsigned char)s[0] == 0xED &&
3809 (unsigned char)s[1] > 0x9F)) {
3810 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003811 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003812 endinpos = startinpos + 1;
3813
3814 /* if s[1] first two bits are 1 and 0, then the invalid
3815 continuation byte is s[2], so increment endinpos by 1,
3816 if not, s[1] is invalid and endinpos doesn't need to
3817 be incremented. */
3818 if ((s[1] & 0xC0) == 0x80)
3819 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003820 goto utf8Error;
3821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003823 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003825 break;
3826
3827 case 4:
3828 if ((s[1] & 0xc0) != 0x80 ||
3829 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003830 (s[3] & 0xc0) != 0x80 ||
3831 ((unsigned char)s[0] == 0xF0 &&
3832 (unsigned char)s[1] < 0x90) ||
3833 ((unsigned char)s[0] == 0xF4 &&
3834 (unsigned char)s[1] > 0x8F)) {
3835 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003836 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003837 endinpos = startinpos + 1;
3838 if ((s[1] & 0xC0) == 0x80) {
3839 endinpos++;
3840 if ((s[2] & 0xC0) == 0x80)
3841 endinpos++;
3842 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003843 goto utf8Error;
3844 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003845 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003846 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3847 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003849 /* If the string is flexible or we have native UCS-4, write
3850 directly.. */
3851 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3852 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 else {
3855 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857 /* translate from 10000..10FFFF to 0..FFFF */
3858 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860 /* high surrogate = top 10 bits added to D800 */
3861 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3862 (Py_UNICODE)(0xD800 + (ch >> 10)));
3863
3864 /* low surrogate = bottom 10 bits added to DC00 */
3865 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3866 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3867 }
3868#if SIZEOF_WCHAR_T == 2
3869 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003870#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872 }
3873 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003874 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003875
Benjamin Peterson29060642009-01-31 22:14:21 +00003876 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 /* If this is not yet a resizable string, make it one.. */
3878 if (kind != PyUnicode_WCHAR_KIND) {
3879 const Py_UNICODE *u;
3880 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3881 if (!new_unicode)
3882 goto onError;
3883 u = PyUnicode_AsUnicode((PyObject *)unicode);
3884 if (!u)
3885 goto onError;
3886#if SIZEOF_WCHAR_T == 2
3887 i += wchar_offset;
3888#endif
3889 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3890 Py_DECREF(unicode);
3891 unicode = new_unicode;
3892 kind = 0;
3893 data = PyUnicode_AS_UNICODE(new_unicode);
3894 assert(data != NULL);
3895 }
3896 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003897 if (unicode_decode_call_errorhandler(
3898 errors, &errorHandler,
3899 "utf8", errmsg,
3900 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003902 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903 /* Update data because unicode_decode_call_errorhandler might have
3904 re-created or resized the unicode object. */
3905 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003906 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908 /* Ensure the unicode_size calculation above was correct: */
3909 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3910
Walter Dörwald69652032004-09-07 20:24:22 +00003911 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003912 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003914 /* Adjust length and ready string when it contained errors and
3915 is of the old resizable kind. */
3916 if (kind == PyUnicode_WCHAR_KIND) {
3917 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3918 PyUnicode_READY(unicode) == -1)
3919 goto onError;
3920 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003922 Py_XDECREF(errorHandler);
3923 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003924 if (PyUnicode_READY(unicode) == -1) {
3925 Py_DECREF(unicode);
3926 return NULL;
3927 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 return (PyObject *)unicode;
3929
Benjamin Peterson29060642009-01-31 22:14:21 +00003930 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 Py_XDECREF(errorHandler);
3932 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 Py_DECREF(unicode);
3934 return NULL;
3935}
3936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003937#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003938
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003939#ifdef __APPLE__
3940
3941/* Simplified UTF-8 decoder using surrogateescape error handler,
3942 used to decode the command line arguments on Mac OS X. */
3943
3944wchar_t*
3945_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3946{
3947 int n;
3948 const char *e;
3949 wchar_t *unicode, *p;
3950
3951 /* Note: size will always be longer than the resulting Unicode
3952 character count */
3953 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3954 PyErr_NoMemory();
3955 return NULL;
3956 }
3957 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3958 if (!unicode)
3959 return NULL;
3960
3961 /* Unpack UTF-8 encoded data */
3962 p = unicode;
3963 e = s + size;
3964 while (s < e) {
3965 Py_UCS4 ch = (unsigned char)*s;
3966
3967 if (ch < 0x80) {
3968 *p++ = (wchar_t)ch;
3969 s++;
3970 continue;
3971 }
3972
3973 n = utf8_code_length[ch];
3974 if (s + n > e) {
3975 goto surrogateescape;
3976 }
3977
3978 switch (n) {
3979 case 0:
3980 case 1:
3981 goto surrogateescape;
3982
3983 case 2:
3984 if ((s[1] & 0xc0) != 0x80)
3985 goto surrogateescape;
3986 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3987 assert ((ch > 0x007F) && (ch <= 0x07FF));
3988 *p++ = (wchar_t)ch;
3989 break;
3990
3991 case 3:
3992 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3993 will result in surrogates in range d800-dfff. Surrogates are
3994 not valid UTF-8 so they are rejected.
3995 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3996 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3997 if ((s[1] & 0xc0) != 0x80 ||
3998 (s[2] & 0xc0) != 0x80 ||
3999 ((unsigned char)s[0] == 0xE0 &&
4000 (unsigned char)s[1] < 0xA0) ||
4001 ((unsigned char)s[0] == 0xED &&
4002 (unsigned char)s[1] > 0x9F)) {
4003
4004 goto surrogateescape;
4005 }
4006 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4007 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004009 break;
4010
4011 case 4:
4012 if ((s[1] & 0xc0) != 0x80 ||
4013 (s[2] & 0xc0) != 0x80 ||
4014 (s[3] & 0xc0) != 0x80 ||
4015 ((unsigned char)s[0] == 0xF0 &&
4016 (unsigned char)s[1] < 0x90) ||
4017 ((unsigned char)s[0] == 0xF4 &&
4018 (unsigned char)s[1] > 0x8F)) {
4019 goto surrogateescape;
4020 }
4021 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4022 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4023 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4024
4025#if SIZEOF_WCHAR_T == 4
4026 *p++ = (wchar_t)ch;
4027#else
4028 /* compute and append the two surrogates: */
4029
4030 /* translate from 10000..10FFFF to 0..FFFF */
4031 ch -= 0x10000;
4032
4033 /* high surrogate = top 10 bits added to D800 */
4034 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4035
4036 /* low surrogate = bottom 10 bits added to DC00 */
4037 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4038#endif
4039 break;
4040 }
4041 s += n;
4042 continue;
4043
4044 surrogateescape:
4045 *p++ = 0xDC00 + ch;
4046 s++;
4047 }
4048 *p = L'\0';
4049 return unicode;
4050}
4051
4052#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054/* Primary internal function which creates utf8 encoded bytes objects.
4055
4056 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004057 and allocate exactly as much space needed at the end. Else allocate the
4058 maximum possible needed (4 result bytes per Unicode character), and return
4059 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004060*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004061PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063{
Tim Peters602f7402002-04-27 18:03:26 +00004064#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004065
Guido van Rossum98297ee2007-11-06 21:34:58 +00004066 Py_ssize_t i; /* index into s of next input byte */
4067 PyObject *result; /* result string object */
4068 char *p; /* next free byte in output buffer */
4069 Py_ssize_t nallocated; /* number of result bytes allocated */
4070 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004071 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004072 PyObject *errorHandler = NULL;
4073 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004074 int kind;
4075 void *data;
4076 Py_ssize_t size;
4077 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4078#if SIZEOF_WCHAR_T == 2
4079 Py_ssize_t wchar_offset = 0;
4080#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 if (!PyUnicode_Check(unicode)) {
4083 PyErr_BadArgument();
4084 return NULL;
4085 }
4086
4087 if (PyUnicode_READY(unicode) == -1)
4088 return NULL;
4089
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004090 if (PyUnicode_UTF8(unicode))
4091 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4092 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093
4094 kind = PyUnicode_KIND(unicode);
4095 data = PyUnicode_DATA(unicode);
4096 size = PyUnicode_GET_LENGTH(unicode);
4097
Tim Peters602f7402002-04-27 18:03:26 +00004098 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099
Tim Peters602f7402002-04-27 18:03:26 +00004100 if (size <= MAX_SHORT_UNICHARS) {
4101 /* Write into the stack buffer; nallocated can't overflow.
4102 * At the end, we'll allocate exactly as much heap space as it
4103 * turns out we need.
4104 */
4105 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004106 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004107 p = stackbuf;
4108 }
4109 else {
4110 /* Overallocate on the heap, and give the excess back at the end. */
4111 nallocated = size * 4;
4112 if (nallocated / 4 != size) /* overflow! */
4113 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004114 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004115 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004116 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004117 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004118 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004119
Tim Peters602f7402002-04-27 18:03:26 +00004120 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004121 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004122
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004123 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004124 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004126
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004128 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004129 *p++ = (char)(0xc0 | (ch >> 6));
4130 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004131 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004132 Py_ssize_t newpos;
4133 PyObject *rep;
4134 Py_ssize_t repsize, k, startpos;
4135 startpos = i-1;
4136#if SIZEOF_WCHAR_T == 2
4137 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004138#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139 rep = unicode_encode_call_errorhandler(
4140 errors, &errorHandler, "utf-8", "surrogates not allowed",
4141 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4142 &exc, startpos, startpos+1, &newpos);
4143 if (!rep)
4144 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004146 if (PyBytes_Check(rep))
4147 repsize = PyBytes_GET_SIZE(rep);
4148 else
4149 repsize = PyUnicode_GET_SIZE(rep);
4150
4151 if (repsize > 4) {
4152 Py_ssize_t offset;
4153
4154 if (result == NULL)
4155 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004156 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004157 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004159 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4160 /* integer overflow */
4161 PyErr_NoMemory();
4162 goto error;
4163 }
4164 nallocated += repsize - 4;
4165 if (result != NULL) {
4166 if (_PyBytes_Resize(&result, nallocated) < 0)
4167 goto error;
4168 } else {
4169 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004170 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004171 goto error;
4172 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4173 }
4174 p = PyBytes_AS_STRING(result) + offset;
4175 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004177 if (PyBytes_Check(rep)) {
4178 char *prep = PyBytes_AS_STRING(rep);
4179 for(k = repsize; k > 0; k--)
4180 *p++ = *prep++;
4181 } else /* rep is unicode */ {
4182 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4183 Py_UNICODE c;
4184
4185 for(k=0; k<repsize; k++) {
4186 c = prep[k];
4187 if (0x80 <= c) {
4188 raise_encode_exception(&exc, "utf-8",
4189 PyUnicode_AS_UNICODE(unicode),
4190 size, i-1, i,
4191 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004192 goto error;
4193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004194 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004195 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004197 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004198 } else if (ch < 0x10000) {
4199 *p++ = (char)(0xe0 | (ch >> 12));
4200 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4201 *p++ = (char)(0x80 | (ch & 0x3f));
4202 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004203 /* Encode UCS4 Unicode ordinals */
4204 *p++ = (char)(0xf0 | (ch >> 18));
4205 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4206 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4207 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004208#if SIZEOF_WCHAR_T == 2
4209 wchar_offset++;
4210#endif
Tim Peters602f7402002-04-27 18:03:26 +00004211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004213
Guido van Rossum98297ee2007-11-06 21:34:58 +00004214 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004215 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004216 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004217 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004218 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004219 }
4220 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004221 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004222 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004223 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004224 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004225 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004226
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004227 Py_XDECREF(errorHandler);
4228 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004229 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004230 error:
4231 Py_XDECREF(errorHandler);
4232 Py_XDECREF(exc);
4233 Py_XDECREF(result);
4234 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004235
Tim Peters602f7402002-04-27 18:03:26 +00004236#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237}
4238
Alexander Belopolsky40018472011-02-26 01:02:56 +00004239PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004240PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4241 Py_ssize_t size,
4242 const char *errors)
4243{
4244 PyObject *v, *unicode;
4245
4246 unicode = PyUnicode_FromUnicode(s, size);
4247 if (unicode == NULL)
4248 return NULL;
4249 v = _PyUnicode_AsUTF8String(unicode, errors);
4250 Py_DECREF(unicode);
4251 return v;
4252}
4253
4254PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004255PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004257 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258}
4259
Walter Dörwald41980ca2007-08-16 21:55:45 +00004260/* --- UTF-32 Codec ------------------------------------------------------- */
4261
4262PyObject *
4263PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004264 Py_ssize_t size,
4265 const char *errors,
4266 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004267{
4268 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4269}
4270
4271PyObject *
4272PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004273 Py_ssize_t size,
4274 const char *errors,
4275 int *byteorder,
4276 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004277{
4278 const char *starts = s;
4279 Py_ssize_t startinpos;
4280 Py_ssize_t endinpos;
4281 Py_ssize_t outpos;
4282 PyUnicodeObject *unicode;
4283 Py_UNICODE *p;
4284#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004285 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004286 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004287#else
4288 const int pairs = 0;
4289#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004290 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004291 int bo = 0; /* assume native ordering by default */
4292 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004293 /* Offsets from q for retrieving bytes in the right order. */
4294#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4295 int iorder[] = {0, 1, 2, 3};
4296#else
4297 int iorder[] = {3, 2, 1, 0};
4298#endif
4299 PyObject *errorHandler = NULL;
4300 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004301
Walter Dörwald41980ca2007-08-16 21:55:45 +00004302 q = (unsigned char *)s;
4303 e = q + size;
4304
4305 if (byteorder)
4306 bo = *byteorder;
4307
4308 /* Check for BOM marks (U+FEFF) in the input and adjust current
4309 byte order setting accordingly. In native mode, the leading BOM
4310 mark is skipped, in all other modes, it is copied to the output
4311 stream as-is (giving a ZWNBSP character). */
4312 if (bo == 0) {
4313 if (size >= 4) {
4314 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004316#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004317 if (bom == 0x0000FEFF) {
4318 q += 4;
4319 bo = -1;
4320 }
4321 else if (bom == 0xFFFE0000) {
4322 q += 4;
4323 bo = 1;
4324 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004325#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004326 if (bom == 0x0000FEFF) {
4327 q += 4;
4328 bo = 1;
4329 }
4330 else if (bom == 0xFFFE0000) {
4331 q += 4;
4332 bo = -1;
4333 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004334#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004335 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004336 }
4337
4338 if (bo == -1) {
4339 /* force LE */
4340 iorder[0] = 0;
4341 iorder[1] = 1;
4342 iorder[2] = 2;
4343 iorder[3] = 3;
4344 }
4345 else if (bo == 1) {
4346 /* force BE */
4347 iorder[0] = 3;
4348 iorder[1] = 2;
4349 iorder[2] = 1;
4350 iorder[3] = 0;
4351 }
4352
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004353 /* On narrow builds we split characters outside the BMP into two
4354 codepoints => count how much extra space we need. */
4355#ifndef Py_UNICODE_WIDE
4356 for (qq = q; qq < e; qq += 4)
4357 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4358 pairs++;
4359#endif
4360
4361 /* This might be one to much, because of a BOM */
4362 unicode = _PyUnicode_New((size+3)/4+pairs);
4363 if (!unicode)
4364 return NULL;
4365 if (size == 0)
4366 return (PyObject *)unicode;
4367
4368 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004369 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004370
Walter Dörwald41980ca2007-08-16 21:55:45 +00004371 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 Py_UCS4 ch;
4373 /* remaining bytes at the end? (size should be divisible by 4) */
4374 if (e-q<4) {
4375 if (consumed)
4376 break;
4377 errmsg = "truncated data";
4378 startinpos = ((const char *)q)-starts;
4379 endinpos = ((const char *)e)-starts;
4380 goto utf32Error;
4381 /* The remaining input chars are ignored if the callback
4382 chooses to skip the input */
4383 }
4384 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4385 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004386
Benjamin Peterson29060642009-01-31 22:14:21 +00004387 if (ch >= 0x110000)
4388 {
4389 errmsg = "codepoint not in range(0x110000)";
4390 startinpos = ((const char *)q)-starts;
4391 endinpos = startinpos+4;
4392 goto utf32Error;
4393 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004394#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 if (ch >= 0x10000)
4396 {
4397 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4398 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4399 }
4400 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004401#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004402 *p++ = ch;
4403 q += 4;
4404 continue;
4405 utf32Error:
4406 outpos = p-PyUnicode_AS_UNICODE(unicode);
4407 if (unicode_decode_call_errorhandler(
4408 errors, &errorHandler,
4409 "utf32", errmsg,
4410 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4411 &unicode, &outpos, &p))
4412 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004413 }
4414
4415 if (byteorder)
4416 *byteorder = bo;
4417
4418 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004419 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004420
4421 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004422 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004423 goto onError;
4424
4425 Py_XDECREF(errorHandler);
4426 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004427 if (PyUnicode_READY(unicode) == -1) {
4428 Py_DECREF(unicode);
4429 return NULL;
4430 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004431 return (PyObject *)unicode;
4432
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004434 Py_DECREF(unicode);
4435 Py_XDECREF(errorHandler);
4436 Py_XDECREF(exc);
4437 return NULL;
4438}
4439
4440PyObject *
4441PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 Py_ssize_t size,
4443 const char *errors,
4444 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004445{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004446 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004447 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004448 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004449#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004450 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004451#else
4452 const int pairs = 0;
4453#endif
4454 /* Offsets from p for storing byte pairs in the right order. */
4455#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4456 int iorder[] = {0, 1, 2, 3};
4457#else
4458 int iorder[] = {3, 2, 1, 0};
4459#endif
4460
Benjamin Peterson29060642009-01-31 22:14:21 +00004461#define STORECHAR(CH) \
4462 do { \
4463 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4464 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4465 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4466 p[iorder[0]] = (CH) & 0xff; \
4467 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004468 } while(0)
4469
4470 /* In narrow builds we can output surrogate pairs as one codepoint,
4471 so we need less space. */
4472#ifndef Py_UNICODE_WIDE
4473 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4475 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4476 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004477#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004478 nsize = (size - pairs + (byteorder == 0));
4479 bytesize = nsize * 4;
4480 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004482 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004483 if (v == NULL)
4484 return NULL;
4485
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004486 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004487 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004489 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004490 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004491
4492 if (byteorder == -1) {
4493 /* force LE */
4494 iorder[0] = 0;
4495 iorder[1] = 1;
4496 iorder[2] = 2;
4497 iorder[3] = 3;
4498 }
4499 else if (byteorder == 1) {
4500 /* force BE */
4501 iorder[0] = 3;
4502 iorder[1] = 2;
4503 iorder[2] = 1;
4504 iorder[3] = 0;
4505 }
4506
4507 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004509#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004510 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4511 Py_UCS4 ch2 = *s;
4512 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4513 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4514 s++;
4515 size--;
4516 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004517 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004518#endif
4519 STORECHAR(ch);
4520 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004521
4522 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004523 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004524#undef STORECHAR
4525}
4526
Alexander Belopolsky40018472011-02-26 01:02:56 +00004527PyObject *
4528PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004529{
4530 if (!PyUnicode_Check(unicode)) {
4531 PyErr_BadArgument();
4532 return NULL;
4533 }
4534 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004535 PyUnicode_GET_SIZE(unicode),
4536 NULL,
4537 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004538}
4539
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540/* --- UTF-16 Codec ------------------------------------------------------- */
4541
Tim Peters772747b2001-08-09 22:21:55 +00004542PyObject *
4543PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 Py_ssize_t size,
4545 const char *errors,
4546 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547{
Walter Dörwald69652032004-09-07 20:24:22 +00004548 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4549}
4550
Antoine Pitrouab868312009-01-10 15:40:25 +00004551/* Two masks for fast checking of whether a C 'long' may contain
4552 UTF16-encoded surrogate characters. This is an efficient heuristic,
4553 assuming that non-surrogate characters with a code point >= 0x8000 are
4554 rare in most input.
4555 FAST_CHAR_MASK is used when the input is in native byte ordering,
4556 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004557*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004558#if (SIZEOF_LONG == 8)
4559# define FAST_CHAR_MASK 0x8000800080008000L
4560# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4561#elif (SIZEOF_LONG == 4)
4562# define FAST_CHAR_MASK 0x80008000L
4563# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4564#else
4565# error C 'long' size should be either 4 or 8!
4566#endif
4567
Walter Dörwald69652032004-09-07 20:24:22 +00004568PyObject *
4569PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 Py_ssize_t size,
4571 const char *errors,
4572 int *byteorder,
4573 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004574{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004575 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004576 Py_ssize_t startinpos;
4577 Py_ssize_t endinpos;
4578 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 PyUnicodeObject *unicode;
4580 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004581 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004582 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004583 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004584 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004585 /* Offsets from q for retrieving byte pairs in the right order. */
4586#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4587 int ihi = 1, ilo = 0;
4588#else
4589 int ihi = 0, ilo = 1;
4590#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 PyObject *errorHandler = NULL;
4592 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593
4594 /* Note: size will always be longer than the resulting Unicode
4595 character count */
4596 unicode = _PyUnicode_New(size);
4597 if (!unicode)
4598 return NULL;
4599 if (size == 0)
4600 return (PyObject *)unicode;
4601
4602 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004603 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004604 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004605 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606
4607 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004608 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004610 /* Check for BOM marks (U+FEFF) in the input and adjust current
4611 byte order setting accordingly. In native mode, the leading BOM
4612 mark is skipped, in all other modes, it is copied to the output
4613 stream as-is (giving a ZWNBSP character). */
4614 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004615 if (size >= 2) {
4616 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004617#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 if (bom == 0xFEFF) {
4619 q += 2;
4620 bo = -1;
4621 }
4622 else if (bom == 0xFFFE) {
4623 q += 2;
4624 bo = 1;
4625 }
Tim Petersced69f82003-09-16 20:30:58 +00004626#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 if (bom == 0xFEFF) {
4628 q += 2;
4629 bo = 1;
4630 }
4631 else if (bom == 0xFFFE) {
4632 q += 2;
4633 bo = -1;
4634 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004635#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004638
Tim Peters772747b2001-08-09 22:21:55 +00004639 if (bo == -1) {
4640 /* force LE */
4641 ihi = 1;
4642 ilo = 0;
4643 }
4644 else if (bo == 1) {
4645 /* force BE */
4646 ihi = 0;
4647 ilo = 1;
4648 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004649#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4650 native_ordering = ilo < ihi;
4651#else
4652 native_ordering = ilo > ihi;
4653#endif
Tim Peters772747b2001-08-09 22:21:55 +00004654
Antoine Pitrouab868312009-01-10 15:40:25 +00004655 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004656 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004658 /* First check for possible aligned read of a C 'long'. Unaligned
4659 reads are more expensive, better to defer to another iteration. */
4660 if (!((size_t) q & LONG_PTR_MASK)) {
4661 /* Fast path for runs of non-surrogate chars. */
4662 register const unsigned char *_q = q;
4663 Py_UNICODE *_p = p;
4664 if (native_ordering) {
4665 /* Native ordering is simple: as long as the input cannot
4666 possibly contain a surrogate char, do an unrolled copy
4667 of several 16-bit code points to the target object.
4668 The non-surrogate check is done on several input bytes
4669 at a time (as many as a C 'long' can contain). */
4670 while (_q < aligned_end) {
4671 unsigned long data = * (unsigned long *) _q;
4672 if (data & FAST_CHAR_MASK)
4673 break;
4674 _p[0] = ((unsigned short *) _q)[0];
4675 _p[1] = ((unsigned short *) _q)[1];
4676#if (SIZEOF_LONG == 8)
4677 _p[2] = ((unsigned short *) _q)[2];
4678 _p[3] = ((unsigned short *) _q)[3];
4679#endif
4680 _q += SIZEOF_LONG;
4681 _p += SIZEOF_LONG / 2;
4682 }
4683 }
4684 else {
4685 /* Byteswapped ordering is similar, but we must decompose
4686 the copy bytewise, and take care of zero'ing out the
4687 upper bytes if the target object is in 32-bit units
4688 (that is, in UCS-4 builds). */
4689 while (_q < aligned_end) {
4690 unsigned long data = * (unsigned long *) _q;
4691 if (data & SWAPPED_FAST_CHAR_MASK)
4692 break;
4693 /* Zero upper bytes in UCS-4 builds */
4694#if (Py_UNICODE_SIZE > 2)
4695 _p[0] = 0;
4696 _p[1] = 0;
4697#if (SIZEOF_LONG == 8)
4698 _p[2] = 0;
4699 _p[3] = 0;
4700#endif
4701#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004702 /* Issue #4916; UCS-4 builds on big endian machines must
4703 fill the two last bytes of each 4-byte unit. */
4704#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4705# define OFF 2
4706#else
4707# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004708#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004709 ((unsigned char *) _p)[OFF + 1] = _q[0];
4710 ((unsigned char *) _p)[OFF + 0] = _q[1];
4711 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4712 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4713#if (SIZEOF_LONG == 8)
4714 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4715 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4716 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4717 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4718#endif
4719#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004720 _q += SIZEOF_LONG;
4721 _p += SIZEOF_LONG / 2;
4722 }
4723 }
4724 p = _p;
4725 q = _q;
4726 if (q >= e)
4727 break;
4728 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004730
Benjamin Peterson14339b62009-01-31 16:36:08 +00004731 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004732
4733 if (ch < 0xD800 || ch > 0xDFFF) {
4734 *p++ = ch;
4735 continue;
4736 }
4737
4738 /* UTF-16 code pair: */
4739 if (q > e) {
4740 errmsg = "unexpected end of data";
4741 startinpos = (((const char *)q) - 2) - starts;
4742 endinpos = ((const char *)e) + 1 - starts;
4743 goto utf16Error;
4744 }
4745 if (0xD800 <= ch && ch <= 0xDBFF) {
4746 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4747 q += 2;
4748 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004749#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 *p++ = ch;
4751 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004752#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004753 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004754#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004755 continue;
4756 }
4757 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004758 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004759 startinpos = (((const char *)q)-4)-starts;
4760 endinpos = startinpos+2;
4761 goto utf16Error;
4762 }
4763
Benjamin Peterson14339b62009-01-31 16:36:08 +00004764 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 errmsg = "illegal encoding";
4766 startinpos = (((const char *)q)-2)-starts;
4767 endinpos = startinpos+2;
4768 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004769
Benjamin Peterson29060642009-01-31 22:14:21 +00004770 utf16Error:
4771 outpos = p - PyUnicode_AS_UNICODE(unicode);
4772 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004773 errors,
4774 &errorHandler,
4775 "utf16", errmsg,
4776 &starts,
4777 (const char **)&e,
4778 &startinpos,
4779 &endinpos,
4780 &exc,
4781 (const char **)&q,
4782 &unicode,
4783 &outpos,
4784 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004785 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004787 /* remaining byte at the end? (size should be even) */
4788 if (e == q) {
4789 if (!consumed) {
4790 errmsg = "truncated data";
4791 startinpos = ((const char *)q) - starts;
4792 endinpos = ((const char *)e) + 1 - starts;
4793 outpos = p - PyUnicode_AS_UNICODE(unicode);
4794 if (unicode_decode_call_errorhandler(
4795 errors,
4796 &errorHandler,
4797 "utf16", errmsg,
4798 &starts,
4799 (const char **)&e,
4800 &startinpos,
4801 &endinpos,
4802 &exc,
4803 (const char **)&q,
4804 &unicode,
4805 &outpos,
4806 &p))
4807 goto onError;
4808 /* The remaining input chars are ignored if the callback
4809 chooses to skip the input */
4810 }
4811 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812
4813 if (byteorder)
4814 *byteorder = bo;
4815
Walter Dörwald69652032004-09-07 20:24:22 +00004816 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004817 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004818
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004820 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821 goto onError;
4822
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 Py_XDECREF(errorHandler);
4824 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004825 if (PyUnicode_READY(unicode) == -1) {
4826 Py_DECREF(unicode);
4827 return NULL;
4828 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 return (PyObject *)unicode;
4830
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 Py_XDECREF(errorHandler);
4834 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835 return NULL;
4836}
4837
Antoine Pitrouab868312009-01-10 15:40:25 +00004838#undef FAST_CHAR_MASK
4839#undef SWAPPED_FAST_CHAR_MASK
4840
Tim Peters772747b2001-08-09 22:21:55 +00004841PyObject *
4842PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 Py_ssize_t size,
4844 const char *errors,
4845 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004847 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004848 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004849 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004850#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004851 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004852#else
4853 const int pairs = 0;
4854#endif
Tim Peters772747b2001-08-09 22:21:55 +00004855 /* Offsets from p for storing byte pairs in the right order. */
4856#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4857 int ihi = 1, ilo = 0;
4858#else
4859 int ihi = 0, ilo = 1;
4860#endif
4861
Benjamin Peterson29060642009-01-31 22:14:21 +00004862#define STORECHAR(CH) \
4863 do { \
4864 p[ihi] = ((CH) >> 8) & 0xff; \
4865 p[ilo] = (CH) & 0xff; \
4866 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004867 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004869#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004870 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004871 if (s[i] >= 0x10000)
4872 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004873#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004874 /* 2 * (size + pairs + (byteorder == 0)) */
4875 if (size > PY_SSIZE_T_MAX ||
4876 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004877 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004878 nsize = size + pairs + (byteorder == 0);
4879 bytesize = nsize * 2;
4880 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004881 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004882 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 if (v == NULL)
4884 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004886 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004889 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004890 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004891
4892 if (byteorder == -1) {
4893 /* force LE */
4894 ihi = 1;
4895 ilo = 0;
4896 }
4897 else if (byteorder == 1) {
4898 /* force BE */
4899 ihi = 0;
4900 ilo = 1;
4901 }
4902
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004903 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004904 Py_UNICODE ch = *s++;
4905 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004906#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 if (ch >= 0x10000) {
4908 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4909 ch = 0xD800 | ((ch-0x10000) >> 10);
4910 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004911#endif
Tim Peters772747b2001-08-09 22:21:55 +00004912 STORECHAR(ch);
4913 if (ch2)
4914 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004915 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004916
4917 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004918 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004919#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920}
4921
Alexander Belopolsky40018472011-02-26 01:02:56 +00004922PyObject *
4923PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924{
4925 if (!PyUnicode_Check(unicode)) {
4926 PyErr_BadArgument();
4927 return NULL;
4928 }
4929 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004930 PyUnicode_GET_SIZE(unicode),
4931 NULL,
4932 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933}
4934
4935/* --- Unicode Escape Codec ----------------------------------------------- */
4936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004937/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4938 if all the escapes in the string make it still a valid ASCII string.
4939 Returns -1 if any escapes were found which cause the string to
4940 pop out of ASCII range. Otherwise returns the length of the
4941 required buffer to hold the string.
4942 */
4943Py_ssize_t
4944length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4945{
4946 const unsigned char *p = (const unsigned char *)s;
4947 const unsigned char *end = p + size;
4948 Py_ssize_t length = 0;
4949
4950 if (size < 0)
4951 return -1;
4952
4953 for (; p < end; ++p) {
4954 if (*p > 127) {
4955 /* Non-ASCII */
4956 return -1;
4957 }
4958 else if (*p != '\\') {
4959 /* Normal character */
4960 ++length;
4961 }
4962 else {
4963 /* Backslash-escape, check next char */
4964 ++p;
4965 /* Escape sequence reaches till end of string or
4966 non-ASCII follow-up. */
4967 if (p >= end || *p > 127)
4968 return -1;
4969 switch (*p) {
4970 case '\n':
4971 /* backslash + \n result in zero characters */
4972 break;
4973 case '\\': case '\'': case '\"':
4974 case 'b': case 'f': case 't':
4975 case 'n': case 'r': case 'v': case 'a':
4976 ++length;
4977 break;
4978 case '0': case '1': case '2': case '3':
4979 case '4': case '5': case '6': case '7':
4980 case 'x': case 'u': case 'U': case 'N':
4981 /* these do not guarantee ASCII characters */
4982 return -1;
4983 default:
4984 /* count the backslash + the other character */
4985 length += 2;
4986 }
4987 }
4988 }
4989 return length;
4990}
4991
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.  When kind is PyUnicode_WCHAR_KIND the
   buffer is addressed as Py_UNICODE units, otherwise as single bytes
   and the value is truncated to unsigned char. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store value as a Py_UNICODE unit at buf[index].
   NOTE: this macro reads a variable named `kind` from the scope it is
   expanded in and asserts that it is PyUnicode_WCHAR_KIND.  The whole
   expansion is parenthesized so that it behaves as a single expression
   wherever it is used. */
#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5005
5006
Fredrik Lundh06d12682001-01-24 07:59:11 +00005007static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005008
Alexander Belopolsky40018472011-02-26 01:02:56 +00005009PyObject *
5010PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005011 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005012 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005014 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005015 Py_ssize_t startinpos;
5016 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005017 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005019 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005021 char* message;
5022 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005023 PyObject *errorHandler = NULL;
5024 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005025 Py_ssize_t ascii_length;
5026 Py_ssize_t i;
5027 int kind;
5028 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005030 ascii_length = length_of_escaped_ascii_string(s, size);
5031
5032 /* After length_of_escaped_ascii_string() there are two alternatives,
5033 either the string is pure ASCII with named escapes like \n, etc.
5034 and we determined it's exact size (common case)
5035 or it contains \x, \u, ... escape sequences. then we create a
5036 legacy wchar string and resize it at the end of this function. */
5037 if (ascii_length >= 0) {
5038 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5039 if (!v)
5040 goto onError;
5041 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5042 kind = PyUnicode_1BYTE_KIND;
5043 data = PyUnicode_DATA(v);
5044 }
5045 else {
5046 /* Escaped strings will always be longer than the resulting
5047 Unicode string, so we start with size here and then reduce the
5048 length after conversion to the true value.
5049 (but if the error callback returns a long replacement string
5050 we'll have to allocate more space) */
5051 v = _PyUnicode_New(size);
5052 if (!v)
5053 goto onError;
5054 kind = PyUnicode_WCHAR_KIND;
5055 data = PyUnicode_AS_UNICODE(v);
5056 }
5057
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058 if (size == 0)
5059 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005060 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005062
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063 while (s < end) {
5064 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005065 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005066 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005068 if (kind == PyUnicode_WCHAR_KIND) {
5069 assert(i < _PyUnicode_WSTR_LENGTH(v));
5070 }
5071 else {
5072 /* The only case in which i == ascii_length is a backslash
5073 followed by a newline. */
5074 assert(i <= ascii_length);
5075 }
5076
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077 /* Non-escape characters are interpreted as Unicode ordinals */
5078 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005079 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080 continue;
5081 }
5082
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005083 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084 /* \ - Escapes */
5085 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005086 c = *s++;
5087 if (s > end)
5088 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005089
5090 if (kind == PyUnicode_WCHAR_KIND) {
5091 assert(i < _PyUnicode_WSTR_LENGTH(v));
5092 }
5093 else {
5094 /* The only case in which i == ascii_length is a backslash
5095 followed by a newline. */
5096 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5097 }
5098
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005099 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005103 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5104 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5105 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5106 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5107 /* FF */
5108 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5109 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5110 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5111 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5112 /* VT */
5113 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5114 /* BEL, not classic C */
5115 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116
Benjamin Peterson29060642009-01-31 22:14:21 +00005117 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 case '0': case '1': case '2': case '3':
5119 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005120 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005121 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005122 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005123 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005124 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005126 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 break;
5128
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 /* hex escapes */
5130 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005132 digits = 2;
5133 message = "truncated \\xXX escape";
5134 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135
Benjamin Peterson29060642009-01-31 22:14:21 +00005136 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005138 digits = 4;
5139 message = "truncated \\uXXXX escape";
5140 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141
Benjamin Peterson29060642009-01-31 22:14:21 +00005142 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005143 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005144 digits = 8;
5145 message = "truncated \\UXXXXXXXX escape";
5146 hexescape:
5147 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005148 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149 if (s+digits>end) {
5150 endinpos = size;
5151 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 errors, &errorHandler,
5153 "unicodeescape", "end of string in escape sequence",
5154 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005155 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005156 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005157 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005158 goto nextByte;
5159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005160 for (j = 0; j < digits; ++j) {
5161 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005162 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005163 endinpos = (s+j+1)-starts;
5164 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005165 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 errors, &errorHandler,
5167 "unicodeescape", message,
5168 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005169 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005170 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005171 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005172 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005173 }
5174 chr = (chr<<4) & ~0xF;
5175 if (c >= '0' && c <= '9')
5176 chr += c - '0';
5177 else if (c >= 'a' && c <= 'f')
5178 chr += 10 + c - 'a';
5179 else
5180 chr += 10 + c - 'A';
5181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005182 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005183 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005184 /* _decoding_error will have already written into the
5185 target buffer. */
5186 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005187 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005188 /* when we get here, chr is a 32-bit unicode character */
5189 if (chr <= 0xffff)
5190 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005191 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005192 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005193 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005194 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005195#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005196 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005197#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005198 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005199 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5200 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005201#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005202 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005203 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005204 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 errors, &errorHandler,
5207 "unicodeescape", "illegal Unicode character",
5208 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005209 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005210 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005211 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005212 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005213 break;
5214
Benjamin Peterson29060642009-01-31 22:14:21 +00005215 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005216 case 'N':
5217 message = "malformed \\N character escape";
5218 if (ucnhash_CAPI == NULL) {
5219 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005220 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5221 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005222 if (ucnhash_CAPI == NULL)
5223 goto ucnhashError;
5224 }
5225 if (*s == '{') {
5226 const char *start = s+1;
5227 /* look for the closing brace */
5228 while (*s != '}' && s < end)
5229 s++;
5230 if (s > start && s < end && *s == '}') {
5231 /* found a name. look it up in the unicode database */
5232 message = "unknown Unicode character name";
5233 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005234 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5235 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005236 goto store;
5237 }
5238 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005239 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005240 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 errors, &errorHandler,
5243 "unicodeescape", message,
5244 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005245 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005246 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005247 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005248 break;
5249
5250 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005251 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005252 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005253 message = "\\ at end of string";
5254 s--;
5255 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005256 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005257 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 errors, &errorHandler,
5259 "unicodeescape", message,
5260 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005261 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005262 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005263 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005264 }
5265 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005266 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5267 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005268 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005269 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005271 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005272 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005274 /* Ensure the length prediction worked in case of ASCII strings */
5275 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5276
5277 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5278 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005279 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005280 Py_XDECREF(errorHandler);
5281 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005283
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005285 PyErr_SetString(
5286 PyExc_UnicodeError,
5287 "\\N escapes not supported (can't load unicodedata module)"
5288 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005289 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005290 Py_XDECREF(errorHandler);
5291 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005292 return NULL;
5293
Benjamin Peterson29060642009-01-31 22:14:21 +00005294 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005296 Py_XDECREF(errorHandler);
5297 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 return NULL;
5299}
5300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005301#undef WRITE_ASCII_OR_WSTR
5302#undef WRITE_WSTR
5303
/* Return a Unicode-Escape string version of the Unicode object.

   The result is a bytes object holding only the escaped characters; no
   surrounding quotes are added.

*/
5310
/* Lowercase hexadecimal digits, indexed by a nibble value (0-15); used by
   the escape encoders below when writing \x, \u and \U escapes. */
static const char *hexdigits = "0123456789abcdef";
5312
Alexander Belopolsky40018472011-02-26 01:02:56 +00005313PyObject *
5314PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005315 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005317 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005320#ifdef Py_UNICODE_WIDE
5321 const Py_ssize_t expandsize = 10;
5322#else
5323 const Py_ssize_t expandsize = 6;
5324#endif
5325
Thomas Wouters89f507f2006-12-13 04:49:30 +00005326 /* XXX(nnorwitz): rather than over-allocating, it would be
5327 better to choose a different scheme. Perhaps scan the
5328 first N-chars of the string and allocate based on that size.
5329 */
5330 /* Initial allocation is based on the longest-possible unichr
5331 escape.
5332
5333 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5334 unichr, so in this case it's the longest unichr escape. In
5335 narrow (UTF-16) builds this is five chars per source unichr
5336 since there are two unichrs in the surrogate pair, so in narrow
5337 (UTF-16) builds it's not the longest unichr escape.
5338
5339 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5340 so in the narrow (UTF-16) build case it's the longest unichr
5341 escape.
5342 */
5343
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005344 if (size == 0)
5345 return PyBytes_FromStringAndSize(NULL, 0);
5346
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005347 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005348 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005349
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005350 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 2
5352 + expandsize*size
5353 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 if (repr == NULL)
5355 return NULL;
5356
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005357 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 while (size-- > 0) {
5360 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005361
Walter Dörwald79e913e2007-05-12 11:08:06 +00005362 /* Escape backslashes */
5363 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364 *p++ = '\\';
5365 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005366 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005367 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005368
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005369#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005370 /* Map 21-bit characters to '\U00xxxxxx' */
5371 else if (ch >= 0x10000) {
5372 *p++ = '\\';
5373 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005374 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5375 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5376 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5377 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5378 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5379 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5380 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5381 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005382 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005383 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005384#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5386 else if (ch >= 0xD800 && ch < 0xDC00) {
5387 Py_UNICODE ch2;
5388 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005389
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 ch2 = *s++;
5391 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005392 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5394 *p++ = '\\';
5395 *p++ = 'U';
5396 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5397 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5398 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5399 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5400 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5401 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5402 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5403 *p++ = hexdigits[ucs & 0x0000000F];
5404 continue;
5405 }
5406 /* Fall through: isolated surrogates are copied as-is */
5407 s--;
5408 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005409 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005410#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005411
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005413 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 *p++ = '\\';
5415 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005416 *p++ = hexdigits[(ch >> 12) & 0x000F];
5417 *p++ = hexdigits[(ch >> 8) & 0x000F];
5418 *p++ = hexdigits[(ch >> 4) & 0x000F];
5419 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005421
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005422 /* Map special whitespace to '\t', \n', '\r' */
5423 else if (ch == '\t') {
5424 *p++ = '\\';
5425 *p++ = 't';
5426 }
5427 else if (ch == '\n') {
5428 *p++ = '\\';
5429 *p++ = 'n';
5430 }
5431 else if (ch == '\r') {
5432 *p++ = '\\';
5433 *p++ = 'r';
5434 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005435
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005436 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005437 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005439 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005440 *p++ = hexdigits[(ch >> 4) & 0x000F];
5441 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005442 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005443
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 /* Copy everything else as-is */
5445 else
5446 *p++ = (char) ch;
5447 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005449 assert(p - PyBytes_AS_STRING(repr) > 0);
5450 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5451 return NULL;
5452 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453}
5454
Alexander Belopolsky40018472011-02-26 01:02:56 +00005455PyObject *
5456PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005458 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 if (!PyUnicode_Check(unicode)) {
5460 PyErr_BadArgument();
5461 return NULL;
5462 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005463 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5464 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005465 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466}
5467
5468/* --- Raw Unicode Escape Codec ------------------------------------------- */
5469
Alexander Belopolsky40018472011-02-26 01:02:56 +00005470PyObject *
5471PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005472 Py_ssize_t size,
5473 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005476 Py_ssize_t startinpos;
5477 Py_ssize_t endinpos;
5478 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005480 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 const char *end;
5482 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005483 PyObject *errorHandler = NULL;
5484 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005485
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 /* Escaped strings will always be longer than the resulting
5487 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 length after conversion to the true value. (But decoding error
5489 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 v = _PyUnicode_New(size);
5491 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005495 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 end = s + size;
5497 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 unsigned char c;
5499 Py_UCS4 x;
5500 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005501 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 /* Non-escape characters are interpreted as Unicode ordinals */
5504 if (*s != '\\') {
5505 *p++ = (unsigned char)*s++;
5506 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005507 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 startinpos = s-starts;
5509
5510 /* \u-escapes are only interpreted iff the number of leading
5511 backslashes if odd */
5512 bs = s;
5513 for (;s < end;) {
5514 if (*s != '\\')
5515 break;
5516 *p++ = (unsigned char)*s++;
5517 }
5518 if (((s - bs) & 1) == 0 ||
5519 s >= end ||
5520 (*s != 'u' && *s != 'U')) {
5521 continue;
5522 }
5523 p--;
5524 count = *s=='u' ? 4 : 8;
5525 s++;
5526
5527 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5528 outpos = p-PyUnicode_AS_UNICODE(v);
5529 for (x = 0, i = 0; i < count; ++i, ++s) {
5530 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005531 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005532 endinpos = s-starts;
5533 if (unicode_decode_call_errorhandler(
5534 errors, &errorHandler,
5535 "rawunicodeescape", "truncated \\uXXXX",
5536 &starts, &end, &startinpos, &endinpos, &exc, &s,
5537 &v, &outpos, &p))
5538 goto onError;
5539 goto nextByte;
5540 }
5541 x = (x<<4) & ~0xF;
5542 if (c >= '0' && c <= '9')
5543 x += c - '0';
5544 else if (c >= 'a' && c <= 'f')
5545 x += 10 + c - 'a';
5546 else
5547 x += 10 + c - 'A';
5548 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005549 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 /* UCS-2 character */
5551 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005552 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 /* UCS-4 character. Either store directly, or as
5554 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005555#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005557#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 x -= 0x10000L;
5559 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5560 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005561#endif
5562 } else {
5563 endinpos = s-starts;
5564 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005565 if (unicode_decode_call_errorhandler(
5566 errors, &errorHandler,
5567 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 &starts, &end, &startinpos, &endinpos, &exc, &s,
5569 &v, &outpos, &p))
5570 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005572 nextByte:
5573 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005575 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005577 Py_XDECREF(errorHandler);
5578 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005579 if (PyUnicode_READY(v) == -1) {
5580 Py_DECREF(v);
5581 return NULL;
5582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005584
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005587 Py_XDECREF(errorHandler);
5588 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 return NULL;
5590}
5591
Alexander Belopolsky40018472011-02-26 01:02:56 +00005592PyObject *
5593PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005594 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005596 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 char *p;
5598 char *q;
5599
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005600#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005601 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005602#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005603 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005604#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005605
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005606 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005608
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005609 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 if (repr == NULL)
5611 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005612 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005613 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005615 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 while (size-- > 0) {
5617 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005618#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005619 /* Map 32-bit characters to '\Uxxxxxxxx' */
5620 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005621 *p++ = '\\';
5622 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005623 *p++ = hexdigits[(ch >> 28) & 0xf];
5624 *p++ = hexdigits[(ch >> 24) & 0xf];
5625 *p++ = hexdigits[(ch >> 20) & 0xf];
5626 *p++ = hexdigits[(ch >> 16) & 0xf];
5627 *p++ = hexdigits[(ch >> 12) & 0xf];
5628 *p++ = hexdigits[(ch >> 8) & 0xf];
5629 *p++ = hexdigits[(ch >> 4) & 0xf];
5630 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005631 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005632 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005633#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5635 if (ch >= 0xD800 && ch < 0xDC00) {
5636 Py_UNICODE ch2;
5637 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005638
Benjamin Peterson29060642009-01-31 22:14:21 +00005639 ch2 = *s++;
5640 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005641 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5643 *p++ = '\\';
5644 *p++ = 'U';
5645 *p++ = hexdigits[(ucs >> 28) & 0xf];
5646 *p++ = hexdigits[(ucs >> 24) & 0xf];
5647 *p++ = hexdigits[(ucs >> 20) & 0xf];
5648 *p++ = hexdigits[(ucs >> 16) & 0xf];
5649 *p++ = hexdigits[(ucs >> 12) & 0xf];
5650 *p++ = hexdigits[(ucs >> 8) & 0xf];
5651 *p++ = hexdigits[(ucs >> 4) & 0xf];
5652 *p++ = hexdigits[ucs & 0xf];
5653 continue;
5654 }
5655 /* Fall through: isolated surrogates are copied as-is */
5656 s--;
5657 size++;
5658 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005659#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 /* Map 16-bit characters to '\uxxxx' */
5661 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 *p++ = '\\';
5663 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005664 *p++ = hexdigits[(ch >> 12) & 0xf];
5665 *p++ = hexdigits[(ch >> 8) & 0xf];
5666 *p++ = hexdigits[(ch >> 4) & 0xf];
5667 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 /* Copy everything else as-is */
5670 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 *p++ = (char) ch;
5672 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005673 size = p - q;
5674
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005675 assert(size > 0);
5676 if (_PyBytes_Resize(&repr, size) < 0)
5677 return NULL;
5678 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679}
5680
Alexander Belopolsky40018472011-02-26 01:02:56 +00005681PyObject *
5682PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005684 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005686 PyErr_BadArgument();
5687 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005689 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5690 PyUnicode_GET_SIZE(unicode));
5691
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005692 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693}
5694
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005695/* --- Unicode Internal Codec ------------------------------------------- */
5696
Alexander Belopolsky40018472011-02-26 01:02:56 +00005697PyObject *
5698_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005699 Py_ssize_t size,
5700 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005701{
5702 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005703 Py_ssize_t startinpos;
5704 Py_ssize_t endinpos;
5705 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005706 PyUnicodeObject *v;
5707 Py_UNICODE *p;
5708 const char *end;
5709 const char *reason;
5710 PyObject *errorHandler = NULL;
5711 PyObject *exc = NULL;
5712
Neal Norwitzd43069c2006-01-08 01:12:10 +00005713#ifdef Py_UNICODE_WIDE
5714 Py_UNICODE unimax = PyUnicode_GetMax();
5715#endif
5716
Thomas Wouters89f507f2006-12-13 04:49:30 +00005717 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005718 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5719 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005720 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005721 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5722 as string was created with the old API. */
5723 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005724 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005725 p = PyUnicode_AS_UNICODE(v);
5726 end = s + size;
5727
5728 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005729 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005730 /* We have to sanity check the raw data, otherwise doom looms for
5731 some malformed UCS-4 data. */
5732 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005733#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005734 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005735#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005736 end-s < Py_UNICODE_SIZE
5737 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005739 startinpos = s - starts;
5740 if (end-s < Py_UNICODE_SIZE) {
5741 endinpos = end-starts;
5742 reason = "truncated input";
5743 }
5744 else {
5745 endinpos = s - starts + Py_UNICODE_SIZE;
5746 reason = "illegal code point (> 0x10FFFF)";
5747 }
5748 outpos = p - PyUnicode_AS_UNICODE(v);
5749 if (unicode_decode_call_errorhandler(
5750 errors, &errorHandler,
5751 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005752 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005753 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005754 goto onError;
5755 }
5756 }
5757 else {
5758 p++;
5759 s += Py_UNICODE_SIZE;
5760 }
5761 }
5762
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005763 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005764 goto onError;
5765 Py_XDECREF(errorHandler);
5766 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005767 if (PyUnicode_READY(v) == -1) {
5768 Py_DECREF(v);
5769 return NULL;
5770 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005771 return (PyObject *)v;
5772
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005774 Py_XDECREF(v);
5775 Py_XDECREF(errorHandler);
5776 Py_XDECREF(exc);
5777 return NULL;
5778}
5779
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780/* --- Latin-1 Codec ------------------------------------------------------ */
5781
Alexander Belopolsky40018472011-02-26 01:02:56 +00005782PyObject *
5783PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005784 Py_ssize_t size,
5785 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005788 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789}
5790
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005792static void
5793make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005794 const char *encoding,
5795 const Py_UNICODE *unicode, Py_ssize_t size,
5796 Py_ssize_t startpos, Py_ssize_t endpos,
5797 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005799 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005800 *exceptionObject = PyUnicodeEncodeError_Create(
5801 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 }
5803 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005804 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5805 goto onError;
5806 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5807 goto onError;
5808 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5809 goto onError;
5810 return;
5811 onError:
5812 Py_DECREF(*exceptionObject);
5813 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814 }
5815}
5816
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005817/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005818static void
5819raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005820 const char *encoding,
5821 const Py_UNICODE *unicode, Py_ssize_t size,
5822 Py_ssize_t startpos, Py_ssize_t endpos,
5823 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824{
5825 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005827 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829}
5830
5831/* error handling callback helper:
5832 build arguments, call the callback and check the arguments,
5833 put the result into newpos and return the replacement string, which
5834 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005835static PyObject *
5836unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005837 PyObject **errorHandler,
5838 const char *encoding, const char *reason,
5839 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5840 Py_ssize_t startpos, Py_ssize_t endpos,
5841 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005842{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005843 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005844
5845 PyObject *restuple;
5846 PyObject *resunicode;
5847
5848 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852 }
5853
5854 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005856 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005858
5859 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005862 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005864 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 Py_DECREF(restuple);
5866 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005868 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 &resunicode, newpos)) {
5870 Py_DECREF(restuple);
5871 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005872 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005873 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5874 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5875 Py_DECREF(restuple);
5876 return NULL;
5877 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005878 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005879 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005880 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5882 Py_DECREF(restuple);
5883 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005884 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005885 Py_INCREF(resunicode);
5886 Py_DECREF(restuple);
5887 return resunicode;
5888}
5889
Alexander Belopolsky40018472011-02-26 01:02:56 +00005890static PyObject *
5891unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005892 Py_ssize_t size,
5893 const char *errors,
5894 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895{
5896 /* output object */
5897 PyObject *res;
5898 /* pointers to the beginning and end+1 of input */
5899 const Py_UNICODE *startp = p;
5900 const Py_UNICODE *endp = p + size;
5901 /* pointer to the beginning of the unencodable characters */
5902 /* const Py_UNICODE *badp = NULL; */
5903 /* pointer into the output */
5904 char *str;
5905 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005906 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005907 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5908 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 PyObject *errorHandler = NULL;
5910 PyObject *exc = NULL;
5911 /* the following variable is used for caching string comparisons
5912 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5913 int known_errorHandler = -1;
5914
5915 /* allocate enough for a simple encoding without
5916 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005917 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005918 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005919 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005920 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005921 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005922 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005923 ressize = size;
5924
5925 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 /* can we encode this? */
5929 if (c<limit) {
5930 /* no overflow check, because we know that the space is enough */
5931 *str++ = (char)c;
5932 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005933 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 else {
5935 Py_ssize_t unicodepos = p-startp;
5936 Py_ssize_t requiredsize;
5937 PyObject *repunicode;
5938 Py_ssize_t repsize;
5939 Py_ssize_t newpos;
5940 Py_ssize_t respos;
5941 Py_UNICODE *uni2;
5942 /* startpos for collecting unencodable chars */
5943 const Py_UNICODE *collstart = p;
5944 const Py_UNICODE *collend = p;
5945 /* find all unecodable characters */
5946 while ((collend < endp) && ((*collend)>=limit))
5947 ++collend;
5948 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5949 if (known_errorHandler==-1) {
5950 if ((errors==NULL) || (!strcmp(errors, "strict")))
5951 known_errorHandler = 1;
5952 else if (!strcmp(errors, "replace"))
5953 known_errorHandler = 2;
5954 else if (!strcmp(errors, "ignore"))
5955 known_errorHandler = 3;
5956 else if (!strcmp(errors, "xmlcharrefreplace"))
5957 known_errorHandler = 4;
5958 else
5959 known_errorHandler = 0;
5960 }
5961 switch (known_errorHandler) {
5962 case 1: /* strict */
5963 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5964 goto onError;
5965 case 2: /* replace */
5966 while (collstart++<collend)
5967 *str++ = '?'; /* fall through */
5968 case 3: /* ignore */
5969 p = collend;
5970 break;
5971 case 4: /* xmlcharrefreplace */
5972 respos = str - PyBytes_AS_STRING(res);
5973 /* determine replacement size (temporarily (mis)uses p) */
5974 for (p = collstart, repsize = 0; p < collend; ++p) {
5975 if (*p<10)
5976 repsize += 2+1+1;
5977 else if (*p<100)
5978 repsize += 2+2+1;
5979 else if (*p<1000)
5980 repsize += 2+3+1;
5981 else if (*p<10000)
5982 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005983#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 else
5985 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005986#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 else if (*p<100000)
5988 repsize += 2+5+1;
5989 else if (*p<1000000)
5990 repsize += 2+6+1;
5991 else
5992 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005993#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 }
5995 requiredsize = respos+repsize+(endp-collend);
5996 if (requiredsize > ressize) {
5997 if (requiredsize<2*ressize)
5998 requiredsize = 2*ressize;
5999 if (_PyBytes_Resize(&res, requiredsize))
6000 goto onError;
6001 str = PyBytes_AS_STRING(res) + respos;
6002 ressize = requiredsize;
6003 }
6004 /* generate replacement (temporarily (mis)uses p) */
6005 for (p = collstart; p < collend; ++p) {
6006 str += sprintf(str, "&#%d;", (int)*p);
6007 }
6008 p = collend;
6009 break;
6010 default:
6011 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6012 encoding, reason, startp, size, &exc,
6013 collstart-startp, collend-startp, &newpos);
6014 if (repunicode == NULL)
6015 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006016 if (PyBytes_Check(repunicode)) {
6017 /* Directly copy bytes result to output. */
6018 repsize = PyBytes_Size(repunicode);
6019 if (repsize > 1) {
6020 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006021 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006022 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6023 Py_DECREF(repunicode);
6024 goto onError;
6025 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006026 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006027 ressize += repsize-1;
6028 }
6029 memcpy(str, PyBytes_AsString(repunicode), repsize);
6030 str += repsize;
6031 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006032 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006033 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006034 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 /* need more space? (at least enough for what we
6036 have+the replacement+the rest of the string, so
6037 we won't have to check space for encodable characters) */
6038 respos = str - PyBytes_AS_STRING(res);
6039 repsize = PyUnicode_GET_SIZE(repunicode);
6040 requiredsize = respos+repsize+(endp-collend);
6041 if (requiredsize > ressize) {
6042 if (requiredsize<2*ressize)
6043 requiredsize = 2*ressize;
6044 if (_PyBytes_Resize(&res, requiredsize)) {
6045 Py_DECREF(repunicode);
6046 goto onError;
6047 }
6048 str = PyBytes_AS_STRING(res) + respos;
6049 ressize = requiredsize;
6050 }
6051 /* check if there is anything unencodable in the replacement
6052 and copy it to the output */
6053 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6054 c = *uni2;
6055 if (c >= limit) {
6056 raise_encode_exception(&exc, encoding, startp, size,
6057 unicodepos, unicodepos+1, reason);
6058 Py_DECREF(repunicode);
6059 goto onError;
6060 }
6061 *str = (char)c;
6062 }
6063 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006064 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006065 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006066 }
6067 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006068 /* Resize if we allocated to much */
6069 size = str - PyBytes_AS_STRING(res);
6070 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006071 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006072 if (_PyBytes_Resize(&res, size) < 0)
6073 goto onError;
6074 }
6075
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006076 Py_XDECREF(errorHandler);
6077 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006078 return res;
6079
6080 onError:
6081 Py_XDECREF(res);
6082 Py_XDECREF(errorHandler);
6083 Py_XDECREF(exc);
6084 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006085}
6086
Alexander Belopolsky40018472011-02-26 01:02:56 +00006087PyObject *
6088PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006089 Py_ssize_t size,
6090 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093}
6094
Alexander Belopolsky40018472011-02-26 01:02:56 +00006095PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006096_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097{
6098 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 PyErr_BadArgument();
6100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006102 if (PyUnicode_READY(unicode) == -1)
6103 return NULL;
6104 /* Fast path: if it is a one-byte string, construct
6105 bytes object directly. */
6106 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6107 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6108 PyUnicode_GET_LENGTH(unicode));
6109 /* Non-Latin-1 characters present. Defer to above function to
6110 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006113 errors);
6114}
6115
6116PyObject*
6117PyUnicode_AsLatin1String(PyObject *unicode)
6118{
6119 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120}
6121
6122/* --- 7-bit ASCII Codec -------------------------------------------------- */
6123
Alexander Belopolsky40018472011-02-26 01:02:56 +00006124PyObject *
6125PyUnicode_DecodeASCII(const char *s,
6126 Py_ssize_t size,
6127 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 PyUnicodeObject *v;
6131 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006132 Py_ssize_t startinpos;
6133 Py_ssize_t endinpos;
6134 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006135 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006136 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006137 PyObject *errorHandler = NULL;
6138 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006139 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006140
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006142 if (size == 1 && *(unsigned char*)s < 128)
6143 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6144
6145 /* Fast path. Assume the input actually *is* ASCII, and allocate
6146 a single-block Unicode object with that assumption. If there is
6147 an error, drop the object and start over. */
6148 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6149 if (v == NULL)
6150 goto onError;
6151 d = PyUnicode_1BYTE_DATA(v);
6152 for (i = 0; i < size; i++) {
6153 unsigned char ch = ((unsigned char*)s)[i];
6154 if (ch < 128)
6155 d[i] = ch;
6156 else
6157 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006159 if (i == size)
6160 return (PyObject*)v;
6161 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006162
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 v = _PyUnicode_New(size);
6164 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006165 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006169 e = s + size;
6170 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 register unsigned char c = (unsigned char)*s;
6172 if (c < 128) {
6173 *p++ = c;
6174 ++s;
6175 }
6176 else {
6177 startinpos = s-starts;
6178 endinpos = startinpos + 1;
6179 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6180 if (unicode_decode_call_errorhandler(
6181 errors, &errorHandler,
6182 "ascii", "ordinal not in range(128)",
6183 &starts, &e, &startinpos, &endinpos, &exc, &s,
6184 &v, &outpos, &p))
6185 goto onError;
6186 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006188 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6190 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006191 Py_XDECREF(errorHandler);
6192 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006193 if (PyUnicode_READY(v) == -1) {
6194 Py_DECREF(v);
6195 return NULL;
6196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006198
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006201 Py_XDECREF(errorHandler);
6202 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 return NULL;
6204}
6205
Alexander Belopolsky40018472011-02-26 01:02:56 +00006206PyObject *
6207PyUnicode_EncodeASCII(const Py_UNICODE *p,
6208 Py_ssize_t size,
6209 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006211 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212}
6213
Alexander Belopolsky40018472011-02-26 01:02:56 +00006214PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006215_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216{
6217 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 PyErr_BadArgument();
6219 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006221 if (PyUnicode_READY(unicode) == -1)
6222 return NULL;
6223 /* Fast path: if it is an ASCII-only string, construct bytes object
6224 directly. Else defer to above function to raise the exception. */
6225 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6226 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6227 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006230 errors);
6231}
6232
6233PyObject *
6234PyUnicode_AsASCIIString(PyObject *unicode)
6235{
6236 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237}
6238
Victor Stinner99b95382011-07-04 14:23:54 +02006239#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006240
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006241/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006242
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006243#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006244#define NEED_RETRY
6245#endif
6246
6247/* XXX This code is limited to "true" double-byte encodings, as
6248 a) it assumes an incomplete character consists of a single byte, and
6249 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006251
/* Return 1 if the byte at s[offset] is flagged by IsDBCSLeadByte() and the
   character found before it by CharPrev() does not contradict that, i.e.
   the previous character is at the same address, does not start with a
   lead byte, or starts exactly two bytes earlier.  Return 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
6263
6264/*
6265 * Decode MBCS string into unicode object. If 'final' is set, converts
6266 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6267 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006268static int
6269decode_mbcs(PyUnicodeObject **v,
6270 const char *s, /* MBCS string */
6271 int size, /* sizeof MBCS string */
6272 int final,
6273 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006274{
6275 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006276 Py_ssize_t n;
6277 DWORD usize;
6278 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006279
6280 assert(size >= 0);
6281
Victor Stinner554f3f02010-06-16 23:33:54 +00006282 /* check and handle 'errors' arg */
6283 if (errors==NULL || strcmp(errors, "strict")==0)
6284 flags = MB_ERR_INVALID_CHARS;
6285 else if (strcmp(errors, "ignore")==0)
6286 flags = 0;
6287 else {
6288 PyErr_Format(PyExc_ValueError,
6289 "mbcs encoding does not support errors='%s'",
6290 errors);
6291 return -1;
6292 }
6293
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006294 /* Skip trailing lead-byte unless 'final' is set */
6295 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006297
6298 /* First get the size of the result */
6299 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006300 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6301 if (usize==0)
6302 goto mbcs_decode_error;
6303 } else
6304 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006305
6306 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 /* Create unicode object */
6308 *v = _PyUnicode_New(usize);
6309 if (*v == NULL)
6310 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006311 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006312 }
6313 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 /* Extend unicode object */
6315 n = PyUnicode_GET_SIZE(*v);
6316 if (_PyUnicode_Resize(v, n + usize) < 0)
6317 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006318 }
6319
6320 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006321 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006323 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6324 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006326 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006327 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006328
6329mbcs_decode_error:
6330 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6331 we raise a UnicodeDecodeError - else it is a 'generic'
6332 windows error
6333 */
6334 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6335 /* Ideally, we should get reason from FormatMessage - this
6336 is the Windows 2000 English version of the message
6337 */
6338 PyObject *exc = NULL;
6339 const char *reason = "No mapping for the Unicode character exists "
6340 "in the target multi-byte code page.";
6341 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6342 if (exc != NULL) {
6343 PyCodec_StrictErrors(exc);
6344 Py_DECREF(exc);
6345 }
6346 } else {
6347 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6348 }
6349 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006350}
6351
Alexander Belopolsky40018472011-02-26 01:02:56 +00006352PyObject *
6353PyUnicode_DecodeMBCSStateful(const char *s,
6354 Py_ssize_t size,
6355 const char *errors,
6356 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006357{
6358 PyUnicodeObject *v = NULL;
6359 int done;
6360
6361 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006362 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006363
6364#ifdef NEED_RETRY
6365 retry:
6366 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006367 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006368 else
6369#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006370 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006371
6372 if (done < 0) {
6373 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006375 }
6376
6377 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006379
6380#ifdef NEED_RETRY
6381 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 s += done;
6383 size -= done;
6384 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006385 }
6386#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006387 if (PyUnicode_READY(v) == -1) {
6388 Py_DECREF(v);
6389 return NULL;
6390 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006391 return (PyObject *)v;
6392}
6393
Alexander Belopolsky40018472011-02-26 01:02:56 +00006394PyObject *
6395PyUnicode_DecodeMBCS(const char *s,
6396 Py_ssize_t size,
6397 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006398{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006399 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6400}
6401
6402/*
6403 * Convert unicode into string object (MBCS).
6404 * Returns 0 if succeed, -1 otherwise.
6405 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006406static int
6407encode_mbcs(PyObject **repr,
6408 const Py_UNICODE *p, /* unicode */
6409 int size, /* size of unicode */
6410 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006411{
Victor Stinner554f3f02010-06-16 23:33:54 +00006412 BOOL usedDefaultChar = FALSE;
6413 BOOL *pusedDefaultChar;
6414 int mbcssize;
6415 Py_ssize_t n;
6416 PyObject *exc = NULL;
6417 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006418
6419 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006420
Victor Stinner554f3f02010-06-16 23:33:54 +00006421 /* check and handle 'errors' arg */
6422 if (errors==NULL || strcmp(errors, "strict")==0) {
6423 flags = WC_NO_BEST_FIT_CHARS;
6424 pusedDefaultChar = &usedDefaultChar;
6425 } else if (strcmp(errors, "replace")==0) {
6426 flags = 0;
6427 pusedDefaultChar = NULL;
6428 } else {
6429 PyErr_Format(PyExc_ValueError,
6430 "mbcs encoding does not support errors='%s'",
6431 errors);
6432 return -1;
6433 }
6434
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006435 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006436 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006437 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6438 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 if (mbcssize == 0) {
6440 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6441 return -1;
6442 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006443 /* If we used a default char, then we failed! */
6444 if (pusedDefaultChar && *pusedDefaultChar)
6445 goto mbcs_encode_error;
6446 } else {
6447 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006448 }
6449
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006450 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 /* Create string object */
6452 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6453 if (*repr == NULL)
6454 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006455 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006456 }
6457 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 /* Extend string object */
6459 n = PyBytes_Size(*repr);
6460 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6461 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006462 }
6463
6464 /* Do the conversion */
6465 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006467 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6468 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6470 return -1;
6471 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006472 if (pusedDefaultChar && *pusedDefaultChar)
6473 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006474 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006475 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006476
6477mbcs_encode_error:
6478 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6479 Py_XDECREF(exc);
6480 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006481}
6482
Alexander Belopolsky40018472011-02-26 01:02:56 +00006483PyObject *
6484PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6485 Py_ssize_t size,
6486 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006487{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006488 PyObject *repr = NULL;
6489 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006490
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006491#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006493 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006494 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006495 else
6496#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006497 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006498
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006499 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 Py_XDECREF(repr);
6501 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006502 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006503
6504#ifdef NEED_RETRY
6505 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 p += INT_MAX;
6507 size -= INT_MAX;
6508 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006509 }
6510#endif
6511
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006512 return repr;
6513}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006514
Alexander Belopolsky40018472011-02-26 01:02:56 +00006515PyObject *
6516PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006517{
6518 if (!PyUnicode_Check(unicode)) {
6519 PyErr_BadArgument();
6520 return NULL;
6521 }
6522 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 PyUnicode_GET_SIZE(unicode),
6524 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006525}
6526
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006527#undef NEED_RETRY
6528
Victor Stinner99b95382011-07-04 14:23:54 +02006529#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006530
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531/* --- Character Mapping Codec -------------------------------------------- */
6532
Alexander Belopolsky40018472011-02-26 01:02:56 +00006533PyObject *
6534PyUnicode_DecodeCharmap(const char *s,
6535 Py_ssize_t size,
6536 PyObject *mapping,
6537 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006539 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006540 Py_ssize_t startinpos;
6541 Py_ssize_t endinpos;
6542 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006543 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 PyUnicodeObject *v;
6545 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006546 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006547 PyObject *errorHandler = NULL;
6548 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006549 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006550 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006551
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 /* Default to Latin-1 */
6553 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555
6556 v = _PyUnicode_New(size);
6557 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006563 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 mapstring = PyUnicode_AS_UNICODE(mapping);
6565 maplen = PyUnicode_GET_SIZE(mapping);
6566 while (s < e) {
6567 unsigned char ch = *s;
6568 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 if (ch < maplen)
6571 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 if (x == 0xfffe) {
6574 /* undefined mapping */
6575 outpos = p-PyUnicode_AS_UNICODE(v);
6576 startinpos = s-starts;
6577 endinpos = startinpos+1;
6578 if (unicode_decode_call_errorhandler(
6579 errors, &errorHandler,
6580 "charmap", "character maps to <undefined>",
6581 &starts, &e, &startinpos, &endinpos, &exc, &s,
6582 &v, &outpos, &p)) {
6583 goto onError;
6584 }
6585 continue;
6586 }
6587 *p++ = x;
6588 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006589 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006590 }
6591 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 while (s < e) {
6593 unsigned char ch = *s;
6594 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006595
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6597 w = PyLong_FromLong((long)ch);
6598 if (w == NULL)
6599 goto onError;
6600 x = PyObject_GetItem(mapping, w);
6601 Py_DECREF(w);
6602 if (x == NULL) {
6603 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6604 /* No mapping found means: mapping is undefined. */
6605 PyErr_Clear();
6606 x = Py_None;
6607 Py_INCREF(x);
6608 } else
6609 goto onError;
6610 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006611
Benjamin Peterson29060642009-01-31 22:14:21 +00006612 /* Apply mapping */
6613 if (PyLong_Check(x)) {
6614 long value = PyLong_AS_LONG(x);
6615 if (value < 0 || value > 65535) {
6616 PyErr_SetString(PyExc_TypeError,
6617 "character mapping must be in range(65536)");
6618 Py_DECREF(x);
6619 goto onError;
6620 }
6621 *p++ = (Py_UNICODE)value;
6622 }
6623 else if (x == Py_None) {
6624 /* undefined mapping */
6625 outpos = p-PyUnicode_AS_UNICODE(v);
6626 startinpos = s-starts;
6627 endinpos = startinpos+1;
6628 if (unicode_decode_call_errorhandler(
6629 errors, &errorHandler,
6630 "charmap", "character maps to <undefined>",
6631 &starts, &e, &startinpos, &endinpos, &exc, &s,
6632 &v, &outpos, &p)) {
6633 Py_DECREF(x);
6634 goto onError;
6635 }
6636 Py_DECREF(x);
6637 continue;
6638 }
6639 else if (PyUnicode_Check(x)) {
6640 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006641
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 if (targetsize == 1)
6643 /* 1-1 mapping */
6644 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006645
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 else if (targetsize > 1) {
6647 /* 1-n mapping */
6648 if (targetsize > extrachars) {
6649 /* resize first */
6650 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6651 Py_ssize_t needed = (targetsize - extrachars) + \
6652 (targetsize << 2);
6653 extrachars += needed;
6654 /* XXX overflow detection missing */
6655 if (_PyUnicode_Resize(&v,
6656 PyUnicode_GET_SIZE(v) + needed) < 0) {
6657 Py_DECREF(x);
6658 goto onError;
6659 }
6660 p = PyUnicode_AS_UNICODE(v) + oldpos;
6661 }
6662 Py_UNICODE_COPY(p,
6663 PyUnicode_AS_UNICODE(x),
6664 targetsize);
6665 p += targetsize;
6666 extrachars -= targetsize;
6667 }
6668 /* 1-0 mapping: skip the character */
6669 }
6670 else {
6671 /* wrong return value */
6672 PyErr_SetString(PyExc_TypeError,
6673 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006674 Py_DECREF(x);
6675 goto onError;
6676 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 Py_DECREF(x);
6678 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 }
6681 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6683 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 Py_XDECREF(errorHandler);
6685 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006686 if (PyUnicode_READY(v) == -1) {
6687 Py_DECREF(v);
6688 return NULL;
6689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006691
Benjamin Peterson29060642009-01-31 22:14:21 +00006692 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006693 Py_XDECREF(errorHandler);
6694 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 Py_XDECREF(v);
6696 return NULL;
6697}
6698
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006699/* Charmap encoding: the lookup table */
6700
Alexander Belopolsky40018472011-02-26 01:02:56 +00006701struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006702 PyObject_HEAD
6703 unsigned char level1[32];
6704 int count2, count3;
6705 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006706};
6707
6708static PyObject*
6709encoding_map_size(PyObject *obj, PyObject* args)
6710{
6711 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006712 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006714}
6715
6716static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006717 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 PyDoc_STR("Return the size (in bytes) of this object") },
6719 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006720};
6721
6722static void
6723encoding_map_dealloc(PyObject* o)
6724{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006725 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006726}
6727
6728static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006729 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 "EncodingMap", /*tp_name*/
6731 sizeof(struct encoding_map), /*tp_basicsize*/
6732 0, /*tp_itemsize*/
6733 /* methods */
6734 encoding_map_dealloc, /*tp_dealloc*/
6735 0, /*tp_print*/
6736 0, /*tp_getattr*/
6737 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006738 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 0, /*tp_repr*/
6740 0, /*tp_as_number*/
6741 0, /*tp_as_sequence*/
6742 0, /*tp_as_mapping*/
6743 0, /*tp_hash*/
6744 0, /*tp_call*/
6745 0, /*tp_str*/
6746 0, /*tp_getattro*/
6747 0, /*tp_setattro*/
6748 0, /*tp_as_buffer*/
6749 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6750 0, /*tp_doc*/
6751 0, /*tp_traverse*/
6752 0, /*tp_clear*/
6753 0, /*tp_richcompare*/
6754 0, /*tp_weaklistoffset*/
6755 0, /*tp_iter*/
6756 0, /*tp_iternext*/
6757 encoding_map_methods, /*tp_methods*/
6758 0, /*tp_members*/
6759 0, /*tp_getset*/
6760 0, /*tp_base*/
6761 0, /*tp_dict*/
6762 0, /*tp_descr_get*/
6763 0, /*tp_descr_set*/
6764 0, /*tp_dictoffset*/
6765 0, /*tp_init*/
6766 0, /*tp_alloc*/
6767 0, /*tp_new*/
6768 0, /*tp_free*/
6769 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006770};
6771
6772PyObject*
6773PyUnicode_BuildEncodingMap(PyObject* string)
6774{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006775 PyObject *result;
6776 struct encoding_map *mresult;
6777 int i;
6778 int need_dict = 0;
6779 unsigned char level1[32];
6780 unsigned char level2[512];
6781 unsigned char *mlevel1, *mlevel2, *mlevel3;
6782 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006783 int kind;
6784 void *data;
6785 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006787 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006788 PyErr_BadArgument();
6789 return NULL;
6790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006791 kind = PyUnicode_KIND(string);
6792 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006793 memset(level1, 0xFF, sizeof level1);
6794 memset(level2, 0xFF, sizeof level2);
6795
6796 /* If there isn't a one-to-one mapping of NULL to \0,
6797 or if there are non-BMP characters, we need to use
6798 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006799 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006800 need_dict = 1;
6801 for (i = 1; i < 256; i++) {
6802 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006803 ch = PyUnicode_READ(kind, data, i);
6804 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006805 need_dict = 1;
6806 break;
6807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006808 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006809 /* unmapped character */
6810 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006811 l1 = ch >> 11;
6812 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006813 if (level1[l1] == 0xFF)
6814 level1[l1] = count2++;
6815 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006816 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006817 }
6818
6819 if (count2 >= 0xFF || count3 >= 0xFF)
6820 need_dict = 1;
6821
6822 if (need_dict) {
6823 PyObject *result = PyDict_New();
6824 PyObject *key, *value;
6825 if (!result)
6826 return NULL;
6827 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006828 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006829 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006830 if (!key || !value)
6831 goto failed1;
6832 if (PyDict_SetItem(result, key, value) == -1)
6833 goto failed1;
6834 Py_DECREF(key);
6835 Py_DECREF(value);
6836 }
6837 return result;
6838 failed1:
6839 Py_XDECREF(key);
6840 Py_XDECREF(value);
6841 Py_DECREF(result);
6842 return NULL;
6843 }
6844
6845 /* Create a three-level trie */
6846 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6847 16*count2 + 128*count3 - 1);
6848 if (!result)
6849 return PyErr_NoMemory();
6850 PyObject_Init(result, &EncodingMapType);
6851 mresult = (struct encoding_map*)result;
6852 mresult->count2 = count2;
6853 mresult->count3 = count3;
6854 mlevel1 = mresult->level1;
6855 mlevel2 = mresult->level23;
6856 mlevel3 = mresult->level23 + 16*count2;
6857 memcpy(mlevel1, level1, 32);
6858 memset(mlevel2, 0xFF, 16*count2);
6859 memset(mlevel3, 0, 128*count3);
6860 count3 = 0;
6861 for (i = 1; i < 256; i++) {
6862 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006863 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006864 /* unmapped character */
6865 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006866 o1 = PyUnicode_READ(kind, data, i)>>11;
6867 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006868 i2 = 16*mlevel1[o1] + o2;
6869 if (mlevel2[i2] == 0xFF)
6870 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006871 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006872 i3 = 128*mlevel2[i2] + o3;
6873 mlevel3[i3] = i;
6874 }
6875 return result;
6876}
6877
6878static int
6879encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6880{
6881 struct encoding_map *map = (struct encoding_map*)mapping;
6882 int l1 = c>>11;
6883 int l2 = (c>>7) & 0xF;
6884 int l3 = c & 0x7F;
6885 int i;
6886
6887#ifdef Py_UNICODE_WIDE
6888 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006890 }
6891#endif
6892 if (c == 0)
6893 return 0;
6894 /* level 1*/
6895 i = map->level1[l1];
6896 if (i == 0xFF) {
6897 return -1;
6898 }
6899 /* level 2*/
6900 i = map->level23[16*i+l2];
6901 if (i == 0xFF) {
6902 return -1;
6903 }
6904 /* level 3 */
6905 i = map->level23[16*map->count2 + 128*i + l3];
6906 if (i == 0) {
6907 return -1;
6908 }
6909 return i;
6910}
6911
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006912/* Lookup the character ch in the mapping. If the character
6913 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006914 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006915static PyObject *
6916charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917{
Christian Heimes217cfd12007-12-02 14:31:20 +00006918 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919 PyObject *x;
6920
6921 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923 x = PyObject_GetItem(mapping, w);
6924 Py_DECREF(w);
6925 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6927 /* No mapping found means: mapping is undefined. */
6928 PyErr_Clear();
6929 x = Py_None;
6930 Py_INCREF(x);
6931 return x;
6932 } else
6933 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006935 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006937 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 long value = PyLong_AS_LONG(x);
6939 if (value < 0 || value > 255) {
6940 PyErr_SetString(PyExc_TypeError,
6941 "character mapping must be in range(256)");
6942 Py_DECREF(x);
6943 return NULL;
6944 }
6945 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006947 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 /* wrong return value */
6951 PyErr_Format(PyExc_TypeError,
6952 "character mapping must return integer, bytes or None, not %.400s",
6953 x->ob_type->tp_name);
6954 Py_DECREF(x);
6955 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 }
6957}
6958
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006959static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006960charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006961{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006962 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6963 /* exponentially overallocate to minimize reallocations */
6964 if (requiredsize < 2*outsize)
6965 requiredsize = 2*outsize;
6966 if (_PyBytes_Resize(outobj, requiredsize))
6967 return -1;
6968 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006969}
6970
/* Result of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the replacement was written to the output */
    enc_FAILED,         /* the character has no mapping */
    enc_EXCEPTION       /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006974/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006975 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006976 space is available. Return a new reference to the object that
6977 was put in the output buffer, or Py_None, if the mapping was undefined
6978 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006979 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006980static charmapencode_result
6981charmapencode_output(Py_UNICODE c, PyObject *mapping,
6982 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006983{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006984 PyObject *rep;
6985 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006986 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006987
Christian Heimes90aa7642007-12-19 02:45:37 +00006988 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006989 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006990 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006991 if (res == -1)
6992 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 if (outsize<requiredsize)
6994 if (charmapencode_resize(outobj, outpos, requiredsize))
6995 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006996 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 outstart[(*outpos)++] = (char)res;
6998 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006999 }
7000
7001 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007002 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007004 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 Py_DECREF(rep);
7006 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007007 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 if (PyLong_Check(rep)) {
7009 Py_ssize_t requiredsize = *outpos+1;
7010 if (outsize<requiredsize)
7011 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7012 Py_DECREF(rep);
7013 return enc_EXCEPTION;
7014 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007015 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007017 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 else {
7019 const char *repchars = PyBytes_AS_STRING(rep);
7020 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7021 Py_ssize_t requiredsize = *outpos+repsize;
7022 if (outsize<requiredsize)
7023 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7024 Py_DECREF(rep);
7025 return enc_EXCEPTION;
7026 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007027 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 memcpy(outstart + *outpos, repchars, repsize);
7029 *outpos += repsize;
7030 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007031 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007032 Py_DECREF(rep);
7033 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007034}
7035
7036/* handle an error in PyUnicode_EncodeCharmap
7037 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007038static int
7039charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007040 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007041 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007042 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007043 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007044{
7045 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007046 Py_ssize_t repsize;
7047 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007048 Py_UNICODE *uni2;
7049 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007050 Py_ssize_t collstartpos = *inpos;
7051 Py_ssize_t collendpos = *inpos+1;
7052 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007053 char *encoding = "charmap";
7054 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007055 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007056
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007057 /* find all unencodable characters */
7058 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007059 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007060 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007061 int res = encoding_map_lookup(p[collendpos], mapping);
7062 if (res != -1)
7063 break;
7064 ++collendpos;
7065 continue;
7066 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007067
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 rep = charmapencode_lookup(p[collendpos], mapping);
7069 if (rep==NULL)
7070 return -1;
7071 else if (rep!=Py_None) {
7072 Py_DECREF(rep);
7073 break;
7074 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007075 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007076 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007077 }
7078 /* cache callback name lookup
7079 * (if not done yet, i.e. it's the first error) */
7080 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 if ((errors==NULL) || (!strcmp(errors, "strict")))
7082 *known_errorHandler = 1;
7083 else if (!strcmp(errors, "replace"))
7084 *known_errorHandler = 2;
7085 else if (!strcmp(errors, "ignore"))
7086 *known_errorHandler = 3;
7087 else if (!strcmp(errors, "xmlcharrefreplace"))
7088 *known_errorHandler = 4;
7089 else
7090 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007091 }
7092 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007093 case 1: /* strict */
7094 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7095 return -1;
7096 case 2: /* replace */
7097 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007098 x = charmapencode_output('?', mapping, res, respos);
7099 if (x==enc_EXCEPTION) {
7100 return -1;
7101 }
7102 else if (x==enc_FAILED) {
7103 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7104 return -1;
7105 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007106 }
7107 /* fall through */
7108 case 3: /* ignore */
7109 *inpos = collendpos;
7110 break;
7111 case 4: /* xmlcharrefreplace */
7112 /* generate replacement (temporarily (mis)uses p) */
7113 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 char buffer[2+29+1+1];
7115 char *cp;
7116 sprintf(buffer, "&#%d;", (int)p[collpos]);
7117 for (cp = buffer; *cp; ++cp) {
7118 x = charmapencode_output(*cp, mapping, res, respos);
7119 if (x==enc_EXCEPTION)
7120 return -1;
7121 else if (x==enc_FAILED) {
7122 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7123 return -1;
7124 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007125 }
7126 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007127 *inpos = collendpos;
7128 break;
7129 default:
7130 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 encoding, reason, p, size, exceptionObject,
7132 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007133 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007135 if (PyBytes_Check(repunicode)) {
7136 /* Directly copy bytes result to output. */
7137 Py_ssize_t outsize = PyBytes_Size(*res);
7138 Py_ssize_t requiredsize;
7139 repsize = PyBytes_Size(repunicode);
7140 requiredsize = *respos + repsize;
7141 if (requiredsize > outsize)
7142 /* Make room for all additional bytes. */
7143 if (charmapencode_resize(res, respos, requiredsize)) {
7144 Py_DECREF(repunicode);
7145 return -1;
7146 }
7147 memcpy(PyBytes_AsString(*res) + *respos,
7148 PyBytes_AsString(repunicode), repsize);
7149 *respos += repsize;
7150 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007151 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007152 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007153 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007154 /* generate replacement */
7155 repsize = PyUnicode_GET_SIZE(repunicode);
7156 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007157 x = charmapencode_output(*uni2, mapping, res, respos);
7158 if (x==enc_EXCEPTION) {
7159 return -1;
7160 }
7161 else if (x==enc_FAILED) {
7162 Py_DECREF(repunicode);
7163 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7164 return -1;
7165 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007166 }
7167 *inpos = newpos;
7168 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007169 }
7170 return 0;
7171}
7172
Alexander Belopolsky40018472011-02-26 01:02:56 +00007173PyObject *
7174PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7175 Py_ssize_t size,
7176 PyObject *mapping,
7177 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007179 /* output object */
7180 PyObject *res = NULL;
7181 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007182 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007183 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007184 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007185 PyObject *errorHandler = NULL;
7186 PyObject *exc = NULL;
7187 /* the following variable is used for caching string comparisons
7188 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7189 * 3=ignore, 4=xmlcharrefreplace */
7190 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191
7192 /* Default to Latin-1 */
7193 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007194 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007196 /* allocate enough for a simple encoding without
7197 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007198 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199 if (res == NULL)
7200 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007201 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007204 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 /* try to encode it */
7206 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7207 if (x==enc_EXCEPTION) /* error */
7208 goto onError;
7209 if (x==enc_FAILED) { /* unencodable character */
7210 if (charmap_encoding_error(p, size, &inpos, mapping,
7211 &exc,
7212 &known_errorHandler, &errorHandler, errors,
7213 &res, &respos)) {
7214 goto onError;
7215 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007216 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 else
7218 /* done with this character => adjust input position */
7219 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007222 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007223 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007224 if (_PyBytes_Resize(&res, respos) < 0)
7225 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007226
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007227 Py_XDECREF(exc);
7228 Py_XDECREF(errorHandler);
7229 return res;
7230
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007232 Py_XDECREF(res);
7233 Py_XDECREF(exc);
7234 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 return NULL;
7236}
7237
Alexander Belopolsky40018472011-02-26 01:02:56 +00007238PyObject *
7239PyUnicode_AsCharmapString(PyObject *unicode,
7240 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241{
7242 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 PyErr_BadArgument();
7244 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 }
7246 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 PyUnicode_GET_SIZE(unicode),
7248 mapping,
7249 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250}
7251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007252/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007253static void
7254make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007255 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007256 Py_ssize_t startpos, Py_ssize_t endpos,
7257 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007259 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007260 *exceptionObject = _PyUnicodeTranslateError_Create(
7261 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262 }
7263 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7265 goto onError;
7266 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7267 goto onError;
7268 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7269 goto onError;
7270 return;
7271 onError:
7272 Py_DECREF(*exceptionObject);
7273 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274 }
7275}
7276
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007277/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007278static void
7279raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007280 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007281 Py_ssize_t startpos, Py_ssize_t endpos,
7282 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007283{
7284 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007285 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007286 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007288}
7289
7290/* error handling callback helper:
7291 build arguments, call the callback and check the arguments,
7292 put the result into newpos and return the replacement string, which
7293 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007294static PyObject *
7295unicode_translate_call_errorhandler(const char *errors,
7296 PyObject **errorHandler,
7297 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007298 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007299 Py_ssize_t startpos, Py_ssize_t endpos,
7300 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007301{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007302 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007303
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007304 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007305 PyObject *restuple;
7306 PyObject *resunicode;
7307
7308 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007310 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007312 }
7313
7314 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007315 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007316 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007318
7319 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007321 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007323 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007324 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 Py_DECREF(restuple);
7326 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007327 }
7328 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007329 &resunicode, &i_newpos)) {
7330 Py_DECREF(restuple);
7331 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007332 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007333 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007334 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007335 else
7336 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007337 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7339 Py_DECREF(restuple);
7340 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007341 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007342 Py_INCREF(resunicode);
7343 Py_DECREF(restuple);
7344 return resunicode;
7345}
7346
7347/* Lookup the character ch in the mapping and put the result in result,
7348 which must be decrefed by the caller.
7349 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007350static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007351charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007352{
Christian Heimes217cfd12007-12-02 14:31:20 +00007353 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007354 PyObject *x;
7355
7356 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007358 x = PyObject_GetItem(mapping, w);
7359 Py_DECREF(w);
7360 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7362 /* No mapping found means: use 1:1 mapping. */
7363 PyErr_Clear();
7364 *result = NULL;
7365 return 0;
7366 } else
7367 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007368 }
7369 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 *result = x;
7371 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007372 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007373 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 long value = PyLong_AS_LONG(x);
7375 long max = PyUnicode_GetMax();
7376 if (value < 0 || value > max) {
7377 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007378 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 Py_DECREF(x);
7380 return -1;
7381 }
7382 *result = x;
7383 return 0;
7384 }
7385 else if (PyUnicode_Check(x)) {
7386 *result = x;
7387 return 0;
7388 }
7389 else {
7390 /* wrong return value */
7391 PyErr_SetString(PyExc_TypeError,
7392 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007393 Py_DECREF(x);
7394 return -1;
7395 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007396}
7397/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 if not reallocate and adjust various state variables.
7399 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007400static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007401charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007403{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007404 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007405 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 /* exponentially overallocate to minimize reallocations */
7407 if (requiredsize < 2 * oldsize)
7408 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007409 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7410 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007412 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007413 }
7414 return 0;
7415}
7416/* lookup the character, put the result in the output string and adjust
7417 various state variables. Return a new reference to the object that
7418 was put in the output buffer in *result, or Py_None, if the mapping was
7419 undefined (in which case no character was written).
7420 The called must decref result.
7421 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007422static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007423charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7424 PyObject *mapping, Py_UCS4 **output,
7425 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007426 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007428 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7429 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007430 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007431 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007433 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007434 }
7435 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007437 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007439 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007440 }
7441 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007442 Py_ssize_t repsize;
7443 if (PyUnicode_READY(*res) == -1)
7444 return -1;
7445 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 if (repsize==1) {
7447 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007448 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 }
7450 else if (repsize!=0) {
7451 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007452 Py_ssize_t requiredsize = *opos +
7453 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007455 Py_ssize_t i;
7456 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007458 for(i = 0; i < repsize; i++)
7459 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007461 }
7462 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007464 return 0;
7465}
7466
Alexander Belopolsky40018472011-02-26 01:02:56 +00007467PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007468_PyUnicode_TranslateCharmap(PyObject *input,
7469 PyObject *mapping,
7470 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007472 /* input object */
7473 char *idata;
7474 Py_ssize_t size, i;
7475 int kind;
7476 /* output buffer */
7477 Py_UCS4 *output = NULL;
7478 Py_ssize_t osize;
7479 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007480 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007481 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007482 char *reason = "character maps to <undefined>";
7483 PyObject *errorHandler = NULL;
7484 PyObject *exc = NULL;
7485 /* the following variable is used for caching string comparisons
7486 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7487 * 3=ignore, 4=xmlcharrefreplace */
7488 int known_errorHandler = -1;
7489
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 PyErr_BadArgument();
7492 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007495 if (PyUnicode_READY(input) == -1)
7496 return NULL;
7497 idata = (char*)PyUnicode_DATA(input);
7498 kind = PyUnicode_KIND(input);
7499 size = PyUnicode_GET_LENGTH(input);
7500 i = 0;
7501
7502 if (size == 0) {
7503 Py_INCREF(input);
7504 return input;
7505 }
7506
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007507 /* allocate enough for a simple 1:1 translation without
7508 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007509 osize = size;
7510 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7511 opos = 0;
7512 if (output == NULL) {
7513 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007517 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 /* try to encode it */
7519 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007520 if (charmaptranslate_output(input, i, mapping,
7521 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 Py_XDECREF(x);
7523 goto onError;
7524 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007525 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007527 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 else { /* untranslatable character */
7529 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7530 Py_ssize_t repsize;
7531 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007532 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007533 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007534 Py_ssize_t collstart = i;
7535 Py_ssize_t collend = i+1;
7536 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007539 while (collend < size) {
7540 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 goto onError;
7542 Py_XDECREF(x);
7543 if (x!=Py_None)
7544 break;
7545 ++collend;
7546 }
7547 /* cache callback name lookup
7548 * (if not done yet, i.e. it's the first error) */
7549 if (known_errorHandler==-1) {
7550 if ((errors==NULL) || (!strcmp(errors, "strict")))
7551 known_errorHandler = 1;
7552 else if (!strcmp(errors, "replace"))
7553 known_errorHandler = 2;
7554 else if (!strcmp(errors, "ignore"))
7555 known_errorHandler = 3;
7556 else if (!strcmp(errors, "xmlcharrefreplace"))
7557 known_errorHandler = 4;
7558 else
7559 known_errorHandler = 0;
7560 }
7561 switch (known_errorHandler) {
7562 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007563 raise_translate_exception(&exc, input, collstart,
7564 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007565 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 case 2: /* replace */
7567 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007568 for (coll = collstart; coll<collend; coll++)
7569 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 /* fall through */
7571 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007572 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 break;
7574 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007575 /* generate replacement (temporarily (mis)uses i) */
7576 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 char buffer[2+29+1+1];
7578 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007579 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7580 if (charmaptranslate_makespace(&output, &osize,
7581 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007582 goto onError;
7583 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007584 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007586 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 break;
7588 default:
7589 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007590 reason, input, &exc,
7591 collstart, collend, &newpos);
7592 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 goto onError;
7594 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007595 repsize = PyUnicode_GET_LENGTH(repunicode);
7596 if (charmaptranslate_makespace(&output, &osize,
7597 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 Py_DECREF(repunicode);
7599 goto onError;
7600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007601 for (uni2 = 0; repsize-->0; ++uni2)
7602 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7603 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007605 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007606 }
7607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007608 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7609 if (!res)
7610 goto onError;
7611 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007612 Py_XDECREF(exc);
7613 Py_XDECREF(errorHandler);
7614 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007617 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007618 Py_XDECREF(exc);
7619 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 return NULL;
7621}
7622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007623/* Deprecated. Use PyUnicode_Translate instead. */
7624PyObject *
7625PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7626 Py_ssize_t size,
7627 PyObject *mapping,
7628 const char *errors)
7629{
7630 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7631 if (!unicode)
7632 return NULL;
7633 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7634}
7635
Alexander Belopolsky40018472011-02-26 01:02:56 +00007636PyObject *
7637PyUnicode_Translate(PyObject *str,
7638 PyObject *mapping,
7639 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640{
7641 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007642
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643 str = PyUnicode_FromObject(str);
7644 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007646 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647 Py_DECREF(str);
7648 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007649
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651 Py_XDECREF(str);
7652 return NULL;
7653}
Tim Petersced69f82003-09-16 20:30:58 +00007654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007655static Py_UCS4
7656fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7657{
7658 /* No need to call PyUnicode_READY(self) because this function is only
7659 called as a callback from fixup() which does it already. */
7660 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7661 const int kind = PyUnicode_KIND(self);
7662 void *data = PyUnicode_DATA(self);
7663 Py_UCS4 maxchar = 0, ch, fixed;
7664 Py_ssize_t i;
7665
7666 for (i = 0; i < len; ++i) {
7667 ch = PyUnicode_READ(kind, data, i);
7668 fixed = 0;
7669 if (ch > 127) {
7670 if (Py_UNICODE_ISSPACE(ch))
7671 fixed = ' ';
7672 else {
7673 const int decimal = Py_UNICODE_TODECIMAL(ch);
7674 if (decimal >= 0)
7675 fixed = '0' + decimal;
7676 }
7677 if (fixed != 0) {
7678 if (fixed > maxchar)
7679 maxchar = fixed;
7680 PyUnicode_WRITE(kind, data, i, fixed);
7681 }
7682 else if (ch > maxchar)
7683 maxchar = ch;
7684 }
7685 else if (ch > maxchar)
7686 maxchar = ch;
7687 }
7688
7689 return maxchar;
7690}
7691
7692PyObject *
7693_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7694{
7695 if (!PyUnicode_Check(unicode)) {
7696 PyErr_BadInternalCall();
7697 return NULL;
7698 }
7699 if (PyUnicode_READY(unicode) == -1)
7700 return NULL;
7701 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7702 /* If the string is already ASCII, just return the same string */
7703 Py_INCREF(unicode);
7704 return unicode;
7705 }
7706 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7707}
7708
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007709PyObject *
7710PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7711 Py_ssize_t length)
7712{
7713 PyObject *result;
7714 Py_UNICODE *p; /* write pointer into result */
7715 Py_ssize_t i;
7716 /* Copy to a new string */
7717 result = (PyObject *)_PyUnicode_New(length);
7718 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7719 if (result == NULL)
7720 return result;
7721 p = PyUnicode_AS_UNICODE(result);
7722 /* Iterate over code points */
7723 for (i = 0; i < length; i++) {
7724 Py_UNICODE ch =s[i];
7725 if (ch > 127) {
7726 int decimal = Py_UNICODE_TODECIMAL(ch);
7727 if (decimal >= 0)
7728 p[i] = '0' + decimal;
7729 }
7730 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007731 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7732 Py_DECREF(result);
7733 return NULL;
7734 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007735 return result;
7736}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007737/* --- Decimal Encoder ---------------------------------------------------- */
7738
Alexander Belopolsky40018472011-02-26 01:02:56 +00007739int
7740PyUnicode_EncodeDecimal(Py_UNICODE *s,
7741 Py_ssize_t length,
7742 char *output,
7743 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007744{
7745 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007746 PyObject *errorHandler = NULL;
7747 PyObject *exc = NULL;
7748 const char *encoding = "decimal";
7749 const char *reason = "invalid decimal Unicode string";
7750 /* the following variable is used for caching string comparisons
7751 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7752 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007753
7754 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 PyErr_BadArgument();
7756 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007757 }
7758
7759 p = s;
7760 end = s + length;
7761 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 register Py_UNICODE ch = *p;
7763 int decimal;
7764 PyObject *repunicode;
7765 Py_ssize_t repsize;
7766 Py_ssize_t newpos;
7767 Py_UNICODE *uni2;
7768 Py_UNICODE *collstart;
7769 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007770
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007772 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 ++p;
7774 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007775 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 decimal = Py_UNICODE_TODECIMAL(ch);
7777 if (decimal >= 0) {
7778 *output++ = '0' + decimal;
7779 ++p;
7780 continue;
7781 }
7782 if (0 < ch && ch < 256) {
7783 *output++ = (char)ch;
7784 ++p;
7785 continue;
7786 }
7787 /* All other characters are considered unencodable */
7788 collstart = p;
7789 collend = p+1;
7790 while (collend < end) {
7791 if ((0 < *collend && *collend < 256) ||
7792 !Py_UNICODE_ISSPACE(*collend) ||
7793 Py_UNICODE_TODECIMAL(*collend))
7794 break;
7795 }
7796 /* cache callback name lookup
7797 * (if not done yet, i.e. it's the first error) */
7798 if (known_errorHandler==-1) {
7799 if ((errors==NULL) || (!strcmp(errors, "strict")))
7800 known_errorHandler = 1;
7801 else if (!strcmp(errors, "replace"))
7802 known_errorHandler = 2;
7803 else if (!strcmp(errors, "ignore"))
7804 known_errorHandler = 3;
7805 else if (!strcmp(errors, "xmlcharrefreplace"))
7806 known_errorHandler = 4;
7807 else
7808 known_errorHandler = 0;
7809 }
7810 switch (known_errorHandler) {
7811 case 1: /* strict */
7812 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7813 goto onError;
7814 case 2: /* replace */
7815 for (p = collstart; p < collend; ++p)
7816 *output++ = '?';
7817 /* fall through */
7818 case 3: /* ignore */
7819 p = collend;
7820 break;
7821 case 4: /* xmlcharrefreplace */
7822 /* generate replacement (temporarily (mis)uses p) */
7823 for (p = collstart; p < collend; ++p)
7824 output += sprintf(output, "&#%d;", (int)*p);
7825 p = collend;
7826 break;
7827 default:
7828 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7829 encoding, reason, s, length, &exc,
7830 collstart-s, collend-s, &newpos);
7831 if (repunicode == NULL)
7832 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007833 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007834 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007835 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7836 Py_DECREF(repunicode);
7837 goto onError;
7838 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 /* generate replacement */
7840 repsize = PyUnicode_GET_SIZE(repunicode);
7841 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7842 Py_UNICODE ch = *uni2;
7843 if (Py_UNICODE_ISSPACE(ch))
7844 *output++ = ' ';
7845 else {
7846 decimal = Py_UNICODE_TODECIMAL(ch);
7847 if (decimal >= 0)
7848 *output++ = '0' + decimal;
7849 else if (0 < ch && ch < 256)
7850 *output++ = (char)ch;
7851 else {
7852 Py_DECREF(repunicode);
7853 raise_encode_exception(&exc, encoding,
7854 s, length, collstart-s, collend-s, reason);
7855 goto onError;
7856 }
7857 }
7858 }
7859 p = s + newpos;
7860 Py_DECREF(repunicode);
7861 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007862 }
7863 /* 0-terminate the output string */
7864 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007865 Py_XDECREF(exc);
7866 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007867 return 0;
7868
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007870 Py_XDECREF(exc);
7871 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007872 return -1;
7873}
7874
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875/* --- Helpers ------------------------------------------------------------ */
7876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877#include "stringlib/ucs1lib.h"
7878#include "stringlib/fastsearch.h"
7879#include "stringlib/partition.h"
7880#include "stringlib/split.h"
7881#include "stringlib/count.h"
7882#include "stringlib/find.h"
7883#include "stringlib/localeutil.h"
7884#include "stringlib/undef.h"
7885
7886#include "stringlib/ucs2lib.h"
7887#include "stringlib/fastsearch.h"
7888#include "stringlib/partition.h"
7889#include "stringlib/split.h"
7890#include "stringlib/count.h"
7891#include "stringlib/find.h"
7892#include "stringlib/localeutil.h"
7893#include "stringlib/undef.h"
7894
7895#include "stringlib/ucs4lib.h"
7896#include "stringlib/fastsearch.h"
7897#include "stringlib/partition.h"
7898#include "stringlib/split.h"
7899#include "stringlib/count.h"
7900#include "stringlib/find.h"
7901#include "stringlib/localeutil.h"
7902#include "stringlib/undef.h"
7903
7904static Py_ssize_t
7905any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7906 const Py_UCS1*, Py_ssize_t,
7907 Py_ssize_t, Py_ssize_t),
7908 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7909 const Py_UCS2*, Py_ssize_t,
7910 Py_ssize_t, Py_ssize_t),
7911 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7912 const Py_UCS4*, Py_ssize_t,
7913 Py_ssize_t, Py_ssize_t),
7914 PyObject* s1, PyObject* s2,
7915 Py_ssize_t start,
7916 Py_ssize_t end)
7917{
7918 int kind1, kind2, kind;
7919 void *buf1, *buf2;
7920 Py_ssize_t len1, len2, result;
7921
7922 kind1 = PyUnicode_KIND(s1);
7923 kind2 = PyUnicode_KIND(s2);
7924 kind = kind1 > kind2 ? kind1 : kind2;
7925 buf1 = PyUnicode_DATA(s1);
7926 buf2 = PyUnicode_DATA(s2);
7927 if (kind1 != kind)
7928 buf1 = _PyUnicode_AsKind(s1, kind);
7929 if (!buf1)
7930 return -2;
7931 if (kind2 != kind)
7932 buf2 = _PyUnicode_AsKind(s2, kind);
7933 if (!buf2) {
7934 if (kind1 != kind) PyMem_Free(buf1);
7935 return -2;
7936 }
7937 len1 = PyUnicode_GET_LENGTH(s1);
7938 len2 = PyUnicode_GET_LENGTH(s2);
7939
7940 switch(kind) {
7941 case PyUnicode_1BYTE_KIND:
7942 result = ucs1(buf1, len1, buf2, len2, start, end);
7943 break;
7944 case PyUnicode_2BYTE_KIND:
7945 result = ucs2(buf1, len1, buf2, len2, start, end);
7946 break;
7947 case PyUnicode_4BYTE_KIND:
7948 result = ucs4(buf1, len1, buf2, len2, start, end);
7949 break;
7950 default:
7951 assert(0); result = -2;
7952 }
7953
7954 if (kind1 != kind)
7955 PyMem_Free(buf1);
7956 if (kind2 != kind)
7957 PyMem_Free(buf2);
7958
7959 return result;
7960}
7961
7962Py_ssize_t
7963_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7964 Py_ssize_t n_buffer,
7965 void *digits, Py_ssize_t n_digits,
7966 Py_ssize_t min_width,
7967 const char *grouping,
7968 const char *thousands_sep)
7969{
7970 switch(kind) {
7971 case PyUnicode_1BYTE_KIND:
7972 return _PyUnicode_ucs1_InsertThousandsGrouping(
7973 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7974 min_width, grouping, thousands_sep);
7975 case PyUnicode_2BYTE_KIND:
7976 return _PyUnicode_ucs2_InsertThousandsGrouping(
7977 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7978 min_width, grouping, thousands_sep);
7979 case PyUnicode_4BYTE_KIND:
7980 return _PyUnicode_ucs4_InsertThousandsGrouping(
7981 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7982 min_width, grouping, thousands_sep);
7983 }
7984 assert(0);
7985 return -1;
7986}
7987
7988
Eric Smith8c663262007-08-25 02:26:07 +00007989#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007990#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007991
Thomas Wouters477c8d52006-05-27 19:21:47 +00007992#include "stringlib/count.h"
7993#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007994
/* Helper macro to fix up start/end slice values in place.

   end is clamped to len; a negative end or start has len added to it and
   is then clamped to 0.  start and end must be modifiable lvalues; len is
   evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. as the body of an if with an else branch). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008009
Alexander Belopolsky40018472011-02-26 01:02:56 +00008010Py_ssize_t
8011PyUnicode_Count(PyObject *str,
8012 PyObject *substr,
8013 Py_ssize_t start,
8014 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008016 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008017 PyUnicodeObject* str_obj;
8018 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008019 int kind1, kind2, kind;
8020 void *buf1 = NULL, *buf2 = NULL;
8021 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008022
Thomas Wouters477c8d52006-05-27 19:21:47 +00008023 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008024 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008026 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008027 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 Py_DECREF(str_obj);
8029 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 }
Tim Petersced69f82003-09-16 20:30:58 +00008031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008032 kind1 = PyUnicode_KIND(str_obj);
8033 kind2 = PyUnicode_KIND(sub_obj);
8034 kind = kind1 > kind2 ? kind1 : kind2;
8035 buf1 = PyUnicode_DATA(str_obj);
8036 if (kind1 != kind)
8037 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8038 if (!buf1)
8039 goto onError;
8040 buf2 = PyUnicode_DATA(sub_obj);
8041 if (kind2 != kind)
8042 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8043 if (!buf2)
8044 goto onError;
8045 len1 = PyUnicode_GET_LENGTH(str_obj);
8046 len2 = PyUnicode_GET_LENGTH(sub_obj);
8047
8048 ADJUST_INDICES(start, end, len1);
8049 switch(kind) {
8050 case PyUnicode_1BYTE_KIND:
8051 result = ucs1lib_count(
8052 ((Py_UCS1*)buf1) + start, end - start,
8053 buf2, len2, PY_SSIZE_T_MAX
8054 );
8055 break;
8056 case PyUnicode_2BYTE_KIND:
8057 result = ucs2lib_count(
8058 ((Py_UCS2*)buf1) + start, end - start,
8059 buf2, len2, PY_SSIZE_T_MAX
8060 );
8061 break;
8062 case PyUnicode_4BYTE_KIND:
8063 result = ucs4lib_count(
8064 ((Py_UCS4*)buf1) + start, end - start,
8065 buf2, len2, PY_SSIZE_T_MAX
8066 );
8067 break;
8068 default:
8069 assert(0); result = 0;
8070 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008071
8072 Py_DECREF(sub_obj);
8073 Py_DECREF(str_obj);
8074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008075 if (kind1 != kind)
8076 PyMem_Free(buf1);
8077 if (kind2 != kind)
8078 PyMem_Free(buf2);
8079
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008081 onError:
8082 Py_DECREF(sub_obj);
8083 Py_DECREF(str_obj);
8084 if (kind1 != kind && buf1)
8085 PyMem_Free(buf1);
8086 if (kind2 != kind && buf2)
8087 PyMem_Free(buf2);
8088 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089}
8090
Alexander Belopolsky40018472011-02-26 01:02:56 +00008091Py_ssize_t
8092PyUnicode_Find(PyObject *str,
8093 PyObject *sub,
8094 Py_ssize_t start,
8095 Py_ssize_t end,
8096 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008098 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008099
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008101 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008103 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008104 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 Py_DECREF(str);
8106 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 }
Tim Petersced69f82003-09-16 20:30:58 +00008108
Thomas Wouters477c8d52006-05-27 19:21:47 +00008109 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008110 result = any_find_slice(
8111 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8112 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008113 );
8114 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008115 result = any_find_slice(
8116 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8117 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008118 );
8119
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008121 Py_DECREF(sub);
8122
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 return result;
8124}
8125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008126Py_ssize_t
8127PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8128 Py_ssize_t start, Py_ssize_t end,
8129 int direction)
8130{
8131 char *result;
8132 int kind;
8133 if (PyUnicode_READY(str) == -1)
8134 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008135 if (start < 0 || end < 0) {
8136 PyErr_SetString(PyExc_IndexError, "string index out of range");
8137 return -2;
8138 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008139 if (end > PyUnicode_GET_LENGTH(str))
8140 end = PyUnicode_GET_LENGTH(str);
8141 kind = PyUnicode_KIND(str);
8142 result = findchar(PyUnicode_1BYTE_DATA(str)
8143 + PyUnicode_KIND_SIZE(kind, start),
8144 kind,
8145 end-start, ch, direction);
8146 if (!result)
8147 return -1;
8148 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8149}
8150
Alexander Belopolsky40018472011-02-26 01:02:56 +00008151static int
8152tailmatch(PyUnicodeObject *self,
8153 PyUnicodeObject *substring,
8154 Py_ssize_t start,
8155 Py_ssize_t end,
8156 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 int kind_self;
8159 int kind_sub;
8160 void *data_self;
8161 void *data_sub;
8162 Py_ssize_t offset;
8163 Py_ssize_t i;
8164 Py_ssize_t end_sub;
8165
8166 if (PyUnicode_READY(self) == -1 ||
8167 PyUnicode_READY(substring) == -1)
8168 return 0;
8169
8170 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171 return 1;
8172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008173 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8174 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008178 kind_self = PyUnicode_KIND(self);
8179 data_self = PyUnicode_DATA(self);
8180 kind_sub = PyUnicode_KIND(substring);
8181 data_sub = PyUnicode_DATA(substring);
8182 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8183
8184 if (direction > 0)
8185 offset = end;
8186 else
8187 offset = start;
8188
8189 if (PyUnicode_READ(kind_self, data_self, offset) ==
8190 PyUnicode_READ(kind_sub, data_sub, 0) &&
8191 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8192 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8193 /* If both are of the same kind, memcmp is sufficient */
8194 if (kind_self == kind_sub) {
8195 return ! memcmp((char *)data_self +
8196 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8197 data_sub,
8198 PyUnicode_GET_LENGTH(substring) *
8199 PyUnicode_CHARACTER_SIZE(substring));
8200 }
8201 /* otherwise we have to compare each character by first accesing it */
8202 else {
8203 /* We do not need to compare 0 and len(substring)-1 because
8204 the if statement above ensured already that they are equal
8205 when we end up here. */
8206 // TODO: honor direction and do a forward or backwards search
8207 for (i = 1; i < end_sub; ++i) {
8208 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8209 PyUnicode_READ(kind_sub, data_sub, i))
8210 return 0;
8211 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214 }
8215
8216 return 0;
8217}
8218
Alexander Belopolsky40018472011-02-26 01:02:56 +00008219Py_ssize_t
8220PyUnicode_Tailmatch(PyObject *str,
8221 PyObject *substr,
8222 Py_ssize_t start,
8223 Py_ssize_t end,
8224 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008226 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008227
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228 str = PyUnicode_FromObject(str);
8229 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231 substr = PyUnicode_FromObject(substr);
8232 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 Py_DECREF(str);
8234 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235 }
Tim Petersced69f82003-09-16 20:30:58 +00008236
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 (PyUnicodeObject *)substr,
8239 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240 Py_DECREF(str);
8241 Py_DECREF(substr);
8242 return result;
8243}
8244
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245/* Apply fixfct filter to the Unicode object self and return a
8246 reference to the modified object */
8247
Alexander Belopolsky40018472011-02-26 01:02:56 +00008248static PyObject *
8249fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008250 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 PyObject *u;
8253 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008255 if (PyUnicode_READY(self) == -1)
8256 return NULL;
8257 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8258 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8259 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008263 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8264 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008266 /* fix functions return the new maximum character in a string,
8267 if the kind of the resulting unicode object does not change,
8268 everything is fine. Otherwise we need to change the string kind
8269 and re-run the fix function. */
8270 maxchar_new = fixfct((PyUnicodeObject*)u);
8271 if (maxchar_new == 0)
8272 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8273 else if (maxchar_new <= 127)
8274 maxchar_new = 127;
8275 else if (maxchar_new <= 255)
8276 maxchar_new = 255;
8277 else if (maxchar_new <= 65535)
8278 maxchar_new = 65535;
8279 else
8280 maxchar_new = 1114111; /* 0x10ffff */
8281
8282 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 /* fixfct should return TRUE if it modified the buffer. If
8284 FALSE, return a reference to the original buffer instead
8285 (to save space, not time) */
8286 Py_INCREF(self);
8287 Py_DECREF(u);
8288 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008290 else if (maxchar_new == maxchar_old) {
8291 return u;
8292 }
8293 else {
8294 /* In case the maximum character changed, we need to
8295 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008296 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008297 if (v == NULL) {
8298 Py_DECREF(u);
8299 return NULL;
8300 }
8301 if (maxchar_new > maxchar_old) {
8302 /* If the maxchar increased so that the kind changed, not all
8303 characters are representable anymore and we need to fix the
8304 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008305 if (PyUnicode_CopyCharacters(v, 0,
8306 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008307 PyUnicode_GET_LENGTH(self)) < 0)
8308 {
8309 Py_DECREF(u);
8310 return NULL;
8311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312 maxchar_old = fixfct((PyUnicodeObject*)v);
8313 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8314 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008315 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008316 if (PyUnicode_CopyCharacters(v, 0,
8317 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008318 PyUnicode_GET_LENGTH(self)) < 0)
8319 {
8320 Py_DECREF(u);
8321 return NULL;
8322 }
8323 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324
8325 Py_DECREF(u);
8326 return v;
8327 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328}
8329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008330static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008331fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008333 /* No need to call PyUnicode_READY(self) because this function is only
8334 called as a callback from fixup() which does it already. */
8335 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8336 const int kind = PyUnicode_KIND(self);
8337 void *data = PyUnicode_DATA(self);
8338 int touched = 0;
8339 Py_UCS4 maxchar = 0;
8340 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342 for (i = 0; i < len; ++i) {
8343 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8344 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8345 if (up != ch) {
8346 if (up > maxchar)
8347 maxchar = up;
8348 PyUnicode_WRITE(kind, data, i, up);
8349 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 else if (ch > maxchar)
8352 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 }
8354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 if (touched)
8356 return maxchar;
8357 else
8358 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359}
8360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008362fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8365 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8366 const int kind = PyUnicode_KIND(self);
8367 void *data = PyUnicode_DATA(self);
8368 int touched = 0;
8369 Py_UCS4 maxchar = 0;
8370 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 for(i = 0; i < len; ++i) {
8373 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8374 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8375 if (lo != ch) {
8376 if (lo > maxchar)
8377 maxchar = lo;
8378 PyUnicode_WRITE(kind, data, i, lo);
8379 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 else if (ch > maxchar)
8382 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 }
8384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 if (touched)
8386 return maxchar;
8387 else
8388 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389}
8390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008392fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8395 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8396 const int kind = PyUnicode_KIND(self);
8397 void *data = PyUnicode_DATA(self);
8398 int touched = 0;
8399 Py_UCS4 maxchar = 0;
8400 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 for(i = 0; i < len; ++i) {
8403 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8404 Py_UCS4 nu = 0;
8405
8406 if (Py_UNICODE_ISUPPER(ch))
8407 nu = Py_UNICODE_TOLOWER(ch);
8408 else if (Py_UNICODE_ISLOWER(ch))
8409 nu = Py_UNICODE_TOUPPER(ch);
8410
8411 if (nu != 0) {
8412 if (nu > maxchar)
8413 maxchar = nu;
8414 PyUnicode_WRITE(kind, data, i, nu);
8415 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 else if (ch > maxchar)
8418 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419 }
8420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008421 if (touched)
8422 return maxchar;
8423 else
8424 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425}
8426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008428fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008430 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8431 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8432 const int kind = PyUnicode_KIND(self);
8433 void *data = PyUnicode_DATA(self);
8434 int touched = 0;
8435 Py_UCS4 maxchar = 0;
8436 Py_ssize_t i = 0;
8437 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008438
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008439 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008441
8442 ch = PyUnicode_READ(kind, data, i);
8443 if (!Py_UNICODE_ISUPPER(ch)) {
8444 maxchar = Py_UNICODE_TOUPPER(ch);
8445 PyUnicode_WRITE(kind, data, i, maxchar);
8446 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 ++i;
8449 for(; i < len; ++i) {
8450 ch = PyUnicode_READ(kind, data, i);
8451 if (!Py_UNICODE_ISLOWER(ch)) {
8452 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8453 if (lo > maxchar)
8454 maxchar = lo;
8455 PyUnicode_WRITE(kind, data, i, lo);
8456 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008458 else if (ch > maxchar)
8459 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008460 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461
8462 if (touched)
8463 return maxchar;
8464 else
8465 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466}
8467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008469fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8472 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8473 const int kind = PyUnicode_KIND(self);
8474 void *data = PyUnicode_DATA(self);
8475 Py_UCS4 maxchar = 0;
8476 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 int previous_is_cased;
8478
8479 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 if (len == 1) {
8481 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8482 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8483 if (ti != ch) {
8484 PyUnicode_WRITE(kind, data, i, ti);
8485 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 }
8487 else
8488 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491 for(; i < len; ++i) {
8492 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8493 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008494
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 nu = Py_UNICODE_TOTITLE(ch);
8499
8500 if (nu > maxchar)
8501 maxchar = nu;
8502 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008503
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 if (Py_UNICODE_ISLOWER(ch) ||
8505 Py_UNICODE_ISUPPER(ch) ||
8506 Py_UNICODE_ISTITLE(ch))
8507 previous_is_cased = 1;
8508 else
8509 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512}
8513
Tim Peters8ce9f162004-08-27 01:49:32 +00008514PyObject *
8515PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008518 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008520 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008521 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8522 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008523 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008524 Py_ssize_t sz, i, res_offset;
8525 Py_UCS4 maxchar = 0;
8526 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527
Tim Peters05eba1f2004-08-27 21:32:02 +00008528 fseq = PySequence_Fast(seq, "");
8529 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008530 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008531 }
8532
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008533 /* NOTE: the following code can't call back into Python code,
8534 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008535 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008536
Tim Peters05eba1f2004-08-27 21:32:02 +00008537 seqlen = PySequence_Fast_GET_SIZE(fseq);
8538 /* If empty sequence, return u"". */
8539 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008541 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008542 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008543 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008544 /* If singleton sequence with an exact Unicode, return that. */
8545 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 item = items[0];
8547 if (PyUnicode_CheckExact(item)) {
8548 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 goto Done;
8551 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008552 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008553 else {
8554 /* Set up sep and seplen */
8555 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556 /* fall back to a blank space separator */
8557 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008558 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008560 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008561 else {
8562 if (!PyUnicode_Check(separator)) {
8563 PyErr_Format(PyExc_TypeError,
8564 "separator: expected str instance,"
8565 " %.80s found",
8566 Py_TYPE(separator)->tp_name);
8567 goto onError;
8568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569 if (PyUnicode_READY(separator) == -1)
8570 goto onError;
8571 sep = separator;
8572 seplen = PyUnicode_GET_LENGTH(separator);
8573 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8574 /* inc refcount to keep this code path symetric with the
8575 above case of a blank separator */
8576 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008577 }
8578 }
8579
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008580 /* There are at least two things to join, or else we have a subclass
8581 * of str in the sequence.
8582 * Do a pre-pass to figure out the total amount of space we'll
8583 * need (sz), and see whether all argument are strings.
8584 */
8585 sz = 0;
8586 for (i = 0; i < seqlen; i++) {
8587 const Py_ssize_t old_sz = sz;
8588 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 if (!PyUnicode_Check(item)) {
8590 PyErr_Format(PyExc_TypeError,
8591 "sequence item %zd: expected str instance,"
8592 " %.80s found",
8593 i, Py_TYPE(item)->tp_name);
8594 goto onError;
8595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 if (PyUnicode_READY(item) == -1)
8597 goto onError;
8598 sz += PyUnicode_GET_LENGTH(item);
8599 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8600 if (item_maxchar > maxchar)
8601 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008602 if (i != 0)
8603 sz += seplen;
8604 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8605 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008607 goto onError;
8608 }
8609 }
Tim Petersced69f82003-09-16 20:30:58 +00008610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008612 if (res == NULL)
8613 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008614
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008615 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008617 Py_ssize_t itemlen;
8618 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 /* Copy item, and maybe the separator. */
8621 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008622 if (PyUnicode_CopyCharacters(res, res_offset,
8623 sep, 0, seplen) < 0)
8624 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008627 if (PyUnicode_CopyCharacters(res, res_offset,
8628 item, 0, itemlen) < 0)
8629 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008633
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008635 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 Py_XDECREF(sep);
8637 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008640 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008642 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 return NULL;
8644}
8645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646#define FILL(kind, data, value, start, length) \
8647 do { \
8648 Py_ssize_t i_ = 0; \
8649 assert(kind != PyUnicode_WCHAR_KIND); \
8650 switch ((kind)) { \
8651 case PyUnicode_1BYTE_KIND: { \
8652 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8653 memset(to_, (unsigned char)value, length); \
8654 break; \
8655 } \
8656 case PyUnicode_2BYTE_KIND: { \
8657 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8658 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8659 break; \
8660 } \
8661 default: { \
8662 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8663 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8664 break; \
8665 } \
8666 } \
8667 } while (0)
8668
Alexander Belopolsky40018472011-02-26 01:02:56 +00008669static PyUnicodeObject *
8670pad(PyUnicodeObject *self,
8671 Py_ssize_t left,
8672 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 PyObject *u;
8676 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008677 int kind;
8678 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679
8680 if (left < 0)
8681 left = 0;
8682 if (right < 0)
8683 right = 0;
8684
Tim Peters7a29bd52001-09-12 03:03:31 +00008685 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 Py_INCREF(self);
8687 return self;
8688 }
8689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8691 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008692 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8693 return NULL;
8694 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8696 if (fill > maxchar)
8697 maxchar = fill;
8698 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008699 if (!u)
8700 return NULL;
8701
8702 kind = PyUnicode_KIND(u);
8703 data = PyUnicode_DATA(u);
8704 if (left)
8705 FILL(kind, data, fill, 0, left);
8706 if (right)
8707 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008708 if (PyUnicode_CopyCharacters(u, left,
8709 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008710 _PyUnicode_LENGTH(self)) < 0)
8711 {
8712 Py_DECREF(u);
8713 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714 }
8715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719
Alexander Belopolsky40018472011-02-26 01:02:56 +00008720PyObject *
8721PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724
8725 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 switch(PyUnicode_KIND(string)) {
8730 case PyUnicode_1BYTE_KIND:
8731 list = ucs1lib_splitlines(
8732 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8733 PyUnicode_GET_LENGTH(string), keepends);
8734 break;
8735 case PyUnicode_2BYTE_KIND:
8736 list = ucs2lib_splitlines(
8737 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8738 PyUnicode_GET_LENGTH(string), keepends);
8739 break;
8740 case PyUnicode_4BYTE_KIND:
8741 list = ucs4lib_splitlines(
8742 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8743 PyUnicode_GET_LENGTH(string), keepends);
8744 break;
8745 default:
8746 assert(0);
8747 list = 0;
8748 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749 Py_DECREF(string);
8750 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751}
8752
Alexander Belopolsky40018472011-02-26 01:02:56 +00008753static PyObject *
8754split(PyUnicodeObject *self,
8755 PyUnicodeObject *substring,
8756 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758 int kind1, kind2, kind;
8759 void *buf1, *buf2;
8760 Py_ssize_t len1, len2;
8761 PyObject* out;
8762
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008764 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008766 if (PyUnicode_READY(self) == -1)
8767 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769 if (substring == NULL)
8770 switch(PyUnicode_KIND(self)) {
8771 case PyUnicode_1BYTE_KIND:
8772 return ucs1lib_split_whitespace(
8773 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8774 PyUnicode_GET_LENGTH(self), maxcount
8775 );
8776 case PyUnicode_2BYTE_KIND:
8777 return ucs2lib_split_whitespace(
8778 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8779 PyUnicode_GET_LENGTH(self), maxcount
8780 );
8781 case PyUnicode_4BYTE_KIND:
8782 return ucs4lib_split_whitespace(
8783 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8784 PyUnicode_GET_LENGTH(self), maxcount
8785 );
8786 default:
8787 assert(0);
8788 return NULL;
8789 }
8790
8791 if (PyUnicode_READY(substring) == -1)
8792 return NULL;
8793
8794 kind1 = PyUnicode_KIND(self);
8795 kind2 = PyUnicode_KIND(substring);
8796 kind = kind1 > kind2 ? kind1 : kind2;
8797 buf1 = PyUnicode_DATA(self);
8798 buf2 = PyUnicode_DATA(substring);
8799 if (kind1 != kind)
8800 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8801 if (!buf1)
8802 return NULL;
8803 if (kind2 != kind)
8804 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8805 if (!buf2) {
8806 if (kind1 != kind) PyMem_Free(buf1);
8807 return NULL;
8808 }
8809 len1 = PyUnicode_GET_LENGTH(self);
8810 len2 = PyUnicode_GET_LENGTH(substring);
8811
8812 switch(kind) {
8813 case PyUnicode_1BYTE_KIND:
8814 out = ucs1lib_split(
8815 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8816 break;
8817 case PyUnicode_2BYTE_KIND:
8818 out = ucs2lib_split(
8819 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8820 break;
8821 case PyUnicode_4BYTE_KIND:
8822 out = ucs4lib_split(
8823 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8824 break;
8825 default:
8826 out = NULL;
8827 }
8828 if (kind1 != kind)
8829 PyMem_Free(buf1);
8830 if (kind2 != kind)
8831 PyMem_Free(buf2);
8832 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833}
8834
Alexander Belopolsky40018472011-02-26 01:02:56 +00008835static PyObject *
8836rsplit(PyUnicodeObject *self,
8837 PyUnicodeObject *substring,
8838 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 int kind1, kind2, kind;
8841 void *buf1, *buf2;
8842 Py_ssize_t len1, len2;
8843 PyObject* out;
8844
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008845 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008846 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 if (PyUnicode_READY(self) == -1)
8849 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 if (substring == NULL)
8852 switch(PyUnicode_KIND(self)) {
8853 case PyUnicode_1BYTE_KIND:
8854 return ucs1lib_rsplit_whitespace(
8855 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8856 PyUnicode_GET_LENGTH(self), maxcount
8857 );
8858 case PyUnicode_2BYTE_KIND:
8859 return ucs2lib_rsplit_whitespace(
8860 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8861 PyUnicode_GET_LENGTH(self), maxcount
8862 );
8863 case PyUnicode_4BYTE_KIND:
8864 return ucs4lib_rsplit_whitespace(
8865 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8866 PyUnicode_GET_LENGTH(self), maxcount
8867 );
8868 default:
8869 assert(0);
8870 return NULL;
8871 }
8872
8873 if (PyUnicode_READY(substring) == -1)
8874 return NULL;
8875
8876 kind1 = PyUnicode_KIND(self);
8877 kind2 = PyUnicode_KIND(substring);
8878 kind = kind1 > kind2 ? kind1 : kind2;
8879 buf1 = PyUnicode_DATA(self);
8880 buf2 = PyUnicode_DATA(substring);
8881 if (kind1 != kind)
8882 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8883 if (!buf1)
8884 return NULL;
8885 if (kind2 != kind)
8886 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8887 if (!buf2) {
8888 if (kind1 != kind) PyMem_Free(buf1);
8889 return NULL;
8890 }
8891 len1 = PyUnicode_GET_LENGTH(self);
8892 len2 = PyUnicode_GET_LENGTH(substring);
8893
8894 switch(kind) {
8895 case PyUnicode_1BYTE_KIND:
8896 out = ucs1lib_rsplit(
8897 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8898 break;
8899 case PyUnicode_2BYTE_KIND:
8900 out = ucs2lib_rsplit(
8901 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8902 break;
8903 case PyUnicode_4BYTE_KIND:
8904 out = ucs4lib_rsplit(
8905 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8906 break;
8907 default:
8908 out = NULL;
8909 }
8910 if (kind1 != kind)
8911 PyMem_Free(buf1);
8912 if (kind2 != kind)
8913 PyMem_Free(buf2);
8914 return out;
8915}
8916
8917static Py_ssize_t
8918anylib_find(int kind, void *buf1, Py_ssize_t len1,
8919 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8920{
8921 switch(kind) {
8922 case PyUnicode_1BYTE_KIND:
8923 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8924 case PyUnicode_2BYTE_KIND:
8925 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8926 case PyUnicode_4BYTE_KIND:
8927 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8928 }
8929 assert(0);
8930 return -1;
8931}
8932
8933static Py_ssize_t
8934anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8935 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8936{
8937 switch(kind) {
8938 case PyUnicode_1BYTE_KIND:
8939 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8940 case PyUnicode_2BYTE_KIND:
8941 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8942 case PyUnicode_4BYTE_KIND:
8943 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8944 }
8945 assert(0);
8946 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008947}
8948
Alexander Belopolsky40018472011-02-26 01:02:56 +00008949static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950replace(PyObject *self, PyObject *str1,
8951 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 PyObject *u;
8954 char *sbuf = PyUnicode_DATA(self);
8955 char *buf1 = PyUnicode_DATA(str1);
8956 char *buf2 = PyUnicode_DATA(str2);
8957 int srelease = 0, release1 = 0, release2 = 0;
8958 int skind = PyUnicode_KIND(self);
8959 int kind1 = PyUnicode_KIND(str1);
8960 int kind2 = PyUnicode_KIND(str2);
8961 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8962 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8963 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964
8965 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008966 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008968 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 if (skind < kind1)
8971 /* substring too wide to be present */
8972 goto nothing;
8973
8974 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008975 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008976 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008978 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008980 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 Py_UCS4 u1, u2, maxchar;
8982 int mayshrink, rkind;
8983 u1 = PyUnicode_READ_CHAR(str1, 0);
8984 if (!findchar(sbuf, PyUnicode_KIND(self),
8985 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008986 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987 u2 = PyUnicode_READ_CHAR(str2, 0);
8988 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8989 /* Replacing u1 with u2 may cause a maxchar reduction in the
8990 result string. */
8991 mayshrink = maxchar > 127;
8992 if (u2 > maxchar) {
8993 maxchar = u2;
8994 mayshrink = 0;
8995 }
8996 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008997 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008999 if (PyUnicode_CopyCharacters(u, 0,
9000 (PyObject*)self, 0, slen) < 0)
9001 {
9002 Py_DECREF(u);
9003 return NULL;
9004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 rkind = PyUnicode_KIND(u);
9006 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9007 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009008 if (--maxcount < 0)
9009 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009012 if (mayshrink) {
9013 PyObject *tmp = u;
9014 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9015 PyUnicode_GET_LENGTH(tmp));
9016 Py_DECREF(tmp);
9017 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009019 int rkind = skind;
9020 char *res;
9021 if (kind1 < rkind) {
9022 /* widen substring */
9023 buf1 = _PyUnicode_AsKind(str1, rkind);
9024 if (!buf1) goto error;
9025 release1 = 1;
9026 }
9027 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009028 if (i < 0)
9029 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 if (rkind > kind2) {
9031 /* widen replacement */
9032 buf2 = _PyUnicode_AsKind(str2, rkind);
9033 if (!buf2) goto error;
9034 release2 = 1;
9035 }
9036 else if (rkind < kind2) {
9037 /* widen self and buf1 */
9038 rkind = kind2;
9039 if (release1) PyMem_Free(buf1);
9040 sbuf = _PyUnicode_AsKind(self, rkind);
9041 if (!sbuf) goto error;
9042 srelease = 1;
9043 buf1 = _PyUnicode_AsKind(str1, rkind);
9044 if (!buf1) goto error;
9045 release1 = 1;
9046 }
9047 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9048 if (!res) {
9049 PyErr_NoMemory();
9050 goto error;
9051 }
9052 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009053 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9055 buf2,
9056 PyUnicode_KIND_SIZE(rkind, len2));
9057 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009058
9059 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9061 slen-i,
9062 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009063 if (i == -1)
9064 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009065 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9066 buf2,
9067 PyUnicode_KIND_SIZE(rkind, len2));
9068 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070
9071 u = PyUnicode_FromKindAndData(rkind, res, slen);
9072 PyMem_Free(res);
9073 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 Py_ssize_t n, i, j, ires;
9078 Py_ssize_t product, new_size;
9079 int rkind = skind;
9080 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 if (kind1 < rkind) {
9083 buf1 = _PyUnicode_AsKind(str1, rkind);
9084 if (!buf1) goto error;
9085 release1 = 1;
9086 }
9087 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009088 if (n == 0)
9089 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 if (kind2 < rkind) {
9091 buf2 = _PyUnicode_AsKind(str2, rkind);
9092 if (!buf2) goto error;
9093 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 else if (kind2 > rkind) {
9096 rkind = kind2;
9097 sbuf = _PyUnicode_AsKind(self, rkind);
9098 if (!sbuf) goto error;
9099 srelease = 1;
9100 if (release1) PyMem_Free(buf1);
9101 buf1 = _PyUnicode_AsKind(str1, rkind);
9102 if (!buf1) goto error;
9103 release1 = 1;
9104 }
9105 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9106 PyUnicode_GET_LENGTH(str1))); */
9107 product = n * (len2-len1);
9108 if ((product / (len2-len1)) != n) {
9109 PyErr_SetString(PyExc_OverflowError,
9110 "replace string is too long");
9111 goto error;
9112 }
9113 new_size = slen + product;
9114 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9115 PyErr_SetString(PyExc_OverflowError,
9116 "replace string is too long");
9117 goto error;
9118 }
9119 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9120 if (!res)
9121 goto error;
9122 ires = i = 0;
9123 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009124 while (n-- > 0) {
9125 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 j = anylib_find(rkind,
9127 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9128 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009129 if (j == -1)
9130 break;
9131 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009132 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9134 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9135 PyUnicode_KIND_SIZE(rkind, j-i));
9136 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009137 }
9138 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 if (len2 > 0) {
9140 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9141 buf2,
9142 PyUnicode_KIND_SIZE(rkind, len2));
9143 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009144 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009148 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009149 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9150 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9151 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009152 } else {
9153 /* interleave */
9154 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9156 buf2,
9157 PyUnicode_KIND_SIZE(rkind, len2));
9158 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009159 if (--n <= 0)
9160 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9162 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9163 PyUnicode_KIND_SIZE(rkind, 1));
9164 ires++;
9165 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9168 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9169 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009170 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009172 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 if (srelease)
9175 PyMem_FREE(sbuf);
9176 if (release1)
9177 PyMem_FREE(buf1);
9178 if (release2)
9179 PyMem_FREE(buf2);
9180 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009181
Benjamin Peterson29060642009-01-31 22:14:21 +00009182 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009183 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184 if (srelease)
9185 PyMem_FREE(sbuf);
9186 if (release1)
9187 PyMem_FREE(buf1);
9188 if (release2)
9189 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009190 if (PyUnicode_CheckExact(self)) {
9191 Py_INCREF(self);
9192 return (PyObject *) self;
9193 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009194 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195 error:
9196 if (srelease && sbuf)
9197 PyMem_FREE(sbuf);
9198 if (release1 && buf1)
9199 PyMem_FREE(buf1);
9200 if (release2 && buf2)
9201 PyMem_FREE(buf2);
9202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009203}
9204
9205/* --- Unicode Object Methods --------------------------------------------- */
9206
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009207PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009208 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009209\n\
9210Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009211characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009212
9213static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009214unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216 return fixup(self, fixtitle);
9217}
9218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009219PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009220 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221\n\
9222Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009223have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224
9225static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009226unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228 return fixup(self, fixcapitalize);
9229}
9230
#if 0
PyDoc_STRVAR(capwords__doc__,
    "S.capwords() -> str\n"
    "\n"
    "Apply .capitalize() to all words in S and return the result with\n"
    "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into words,
   capitalize every word in place inside the list, then join the list.
   Returns NULL if splitting or capitalizing fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9268
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009269/* Argument converter. Coerces to a single unicode character */
9270
9271static int
9272convert_uc(PyObject *obj, void *addr)
9273{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009275 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009276
Benjamin Peterson14339b62009-01-31 16:36:08 +00009277 uniobj = PyUnicode_FromObject(obj);
9278 if (uniobj == NULL) {
9279 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009280 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009281 return 0;
9282 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009284 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009285 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009286 Py_DECREF(uniobj);
9287 return 0;
9288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009290 Py_DECREF(uniobj);
9291 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009292}
9293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009294PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009295 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009297Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009298done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299
9300static PyObject *
9301unicode_center(PyUnicodeObject *self, PyObject *args)
9302{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009303 Py_ssize_t marg, left;
9304 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 Py_UCS4 fillchar = ' ';
9306
Victor Stinnere9a29352011-10-01 02:14:59 +02009307 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309
Victor Stinnere9a29352011-10-01 02:14:59 +02009310 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009311 return NULL;
9312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009314 Py_INCREF(self);
9315 return (PyObject*) self;
9316 }
9317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009319 left = marg / 2 + (marg & width & 1);
9320
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009321 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322}
9323
Marc-André Lemburge5034372000-08-08 08:04:29 +00009324#if 0
9325
9326/* This code should go into some future Unicode collation support
9327 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009328 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009329
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009330/* speedy UTF-16 code point order comparison */
9331/* gleaned from: */
9332/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9333
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009334static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009335{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009336 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009337 0, 0, 0, 0, 0, 0, 0, 0,
9338 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009339 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009340};
9341
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342static int
9343unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9344{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009345 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009346
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347 Py_UNICODE *s1 = str1->str;
9348 Py_UNICODE *s2 = str2->str;
9349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350 len1 = str1->_base._base.length;
9351 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009352
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009354 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009355
9356 c1 = *s1++;
9357 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009358
Benjamin Peterson29060642009-01-31 22:14:21 +00009359 if (c1 > (1<<11) * 26)
9360 c1 += utf16Fixup[c1>>11];
9361 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009362 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009363 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009364
9365 if (c1 != c2)
9366 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009367
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009368 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 }
9370
9371 return (len1 < len2) ? -1 : (len1 != len2);
9372}
9373
Marc-André Lemburge5034372000-08-08 08:04:29 +00009374#else
9375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376/* This function assumes that str1 and str2 are readied by the caller. */
9377
Marc-André Lemburge5034372000-08-08 08:04:29 +00009378static int
9379unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9380{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381 int kind1, kind2;
9382 void *data1, *data2;
9383 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 kind1 = PyUnicode_KIND(str1);
9386 kind2 = PyUnicode_KIND(str2);
9387 data1 = PyUnicode_DATA(str1);
9388 data2 = PyUnicode_DATA(str2);
9389 len1 = PyUnicode_GET_LENGTH(str1);
9390 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392 for (i = 0; i < len1 && i < len2; ++i) {
9393 Py_UCS4 c1, c2;
9394 c1 = PyUnicode_READ(kind1, data1, i);
9395 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009396
9397 if (c1 != c2)
9398 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009399 }
9400
9401 return (len1 < len2) ? -1 : (len1 != len2);
9402}
9403
9404#endif
9405
Alexander Belopolsky40018472011-02-26 01:02:56 +00009406int
9407PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9410 if (PyUnicode_READY(left) == -1 ||
9411 PyUnicode_READY(right) == -1)
9412 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009413 return unicode_compare((PyUnicodeObject *)left,
9414 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009416 PyErr_Format(PyExc_TypeError,
9417 "Can't compare %.100s and %.100s",
9418 left->ob_type->tp_name,
9419 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420 return -1;
9421}
9422
Martin v. Löwis5b222132007-06-10 09:51:05 +00009423int
9424PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 Py_ssize_t i;
9427 int kind;
9428 void *data;
9429 Py_UCS4 chr;
9430
Martin v. Löwis5b222132007-06-10 09:51:05 +00009431 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 if (PyUnicode_READY(uni) == -1)
9433 return -1;
9434 kind = PyUnicode_KIND(uni);
9435 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009436 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9438 if (chr != str[i])
9439 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009440 /* This check keeps Python strings that end in '\0' from comparing equal
9441 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009443 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009444 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009445 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009446 return 0;
9447}
9448
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009449
Benjamin Peterson29060642009-01-31 22:14:21 +00009450#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009451 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009452
Alexander Belopolsky40018472011-02-26 01:02:56 +00009453PyObject *
9454PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009455{
9456 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009457
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009458 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9459 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 if (PyUnicode_READY(left) == -1 ||
9461 PyUnicode_READY(right) == -1)
9462 return NULL;
9463 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9464 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009465 if (op == Py_EQ) {
9466 Py_INCREF(Py_False);
9467 return Py_False;
9468 }
9469 if (op == Py_NE) {
9470 Py_INCREF(Py_True);
9471 return Py_True;
9472 }
9473 }
9474 if (left == right)
9475 result = 0;
9476 else
9477 result = unicode_compare((PyUnicodeObject *)left,
9478 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009479
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009480 /* Convert the return value to a Boolean */
9481 switch (op) {
9482 case Py_EQ:
9483 v = TEST_COND(result == 0);
9484 break;
9485 case Py_NE:
9486 v = TEST_COND(result != 0);
9487 break;
9488 case Py_LE:
9489 v = TEST_COND(result <= 0);
9490 break;
9491 case Py_GE:
9492 v = TEST_COND(result >= 0);
9493 break;
9494 case Py_LT:
9495 v = TEST_COND(result == -1);
9496 break;
9497 case Py_GT:
9498 v = TEST_COND(result == 1);
9499 break;
9500 default:
9501 PyErr_BadArgument();
9502 return NULL;
9503 }
9504 Py_INCREF(v);
9505 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009506 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009507
Brian Curtindfc80e32011-08-10 20:28:54 -05009508 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009509}
9510
Alexander Belopolsky40018472011-02-26 01:02:56 +00009511int
9512PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009513{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009514 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009515 int kind1, kind2, kind;
9516 void *buf1, *buf2;
9517 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009518 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009519
9520 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009521 sub = PyUnicode_FromObject(element);
9522 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009523 PyErr_Format(PyExc_TypeError,
9524 "'in <string>' requires string as left operand, not %s",
9525 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009526 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009527 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528 if (PyUnicode_READY(sub) == -1)
9529 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009530
Thomas Wouters477c8d52006-05-27 19:21:47 +00009531 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009532 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009533 Py_DECREF(sub);
9534 return -1;
9535 }
9536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 kind1 = PyUnicode_KIND(str);
9538 kind2 = PyUnicode_KIND(sub);
9539 kind = kind1 > kind2 ? kind1 : kind2;
9540 buf1 = PyUnicode_DATA(str);
9541 buf2 = PyUnicode_DATA(sub);
9542 if (kind1 != kind)
9543 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9544 if (!buf1) {
9545 Py_DECREF(sub);
9546 return -1;
9547 }
9548 if (kind2 != kind)
9549 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9550 if (!buf2) {
9551 Py_DECREF(sub);
9552 if (kind1 != kind) PyMem_Free(buf1);
9553 return -1;
9554 }
9555 len1 = PyUnicode_GET_LENGTH(str);
9556 len2 = PyUnicode_GET_LENGTH(sub);
9557
9558 switch(kind) {
9559 case PyUnicode_1BYTE_KIND:
9560 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9561 break;
9562 case PyUnicode_2BYTE_KIND:
9563 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9564 break;
9565 case PyUnicode_4BYTE_KIND:
9566 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9567 break;
9568 default:
9569 result = -1;
9570 assert(0);
9571 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009572
9573 Py_DECREF(str);
9574 Py_DECREF(sub);
9575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576 if (kind1 != kind)
9577 PyMem_Free(buf1);
9578 if (kind2 != kind)
9579 PyMem_Free(buf2);
9580
Guido van Rossum403d68b2000-03-13 15:55:09 +00009581 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009582}
9583
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584/* Concat to string or Unicode object giving a new Unicode object. */
9585
Alexander Belopolsky40018472011-02-26 01:02:56 +00009586PyObject *
9587PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009588{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009589 PyObject *u = NULL, *v = NULL, *w;
9590 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009591
9592 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009595 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009598 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599
9600 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009601 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009602 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009605 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009606 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608 }
9609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009611 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614 w = PyUnicode_New(
9615 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9616 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009617 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009618 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009619 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9620 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009621 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009622 v, 0,
9623 PyUnicode_GET_LENGTH(v)) < 0)
9624 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625 Py_DECREF(u);
9626 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628
Benjamin Peterson29060642009-01-31 22:14:21 +00009629 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630 Py_XDECREF(u);
9631 Py_XDECREF(v);
9632 return NULL;
9633}
9634
Walter Dörwald1ab83302007-05-18 17:15:44 +00009635void
9636PyUnicode_Append(PyObject **pleft, PyObject *right)
9637{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009638 PyObject *new;
9639 if (*pleft == NULL)
9640 return;
9641 if (right == NULL || !PyUnicode_Check(*pleft)) {
9642 Py_DECREF(*pleft);
9643 *pleft = NULL;
9644 return;
9645 }
9646 new = PyUnicode_Concat(*pleft, right);
9647 Py_DECREF(*pleft);
9648 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009649}
9650
9651void
9652PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9653{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009654 PyUnicode_Append(pleft, right);
9655 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009656}
9657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009658PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009659 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009661Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009662string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009663interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664
9665static PyObject *
9666unicode_count(PyUnicodeObject *self, PyObject *args)
9667{
9668 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009669 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009670 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 int kind1, kind2, kind;
9673 void *buf1, *buf2;
9674 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675
Jesus Ceaac451502011-04-20 17:09:23 +02009676 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9677 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009678 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 kind1 = PyUnicode_KIND(self);
9681 kind2 = PyUnicode_KIND(substring);
9682 kind = kind1 > kind2 ? kind1 : kind2;
9683 buf1 = PyUnicode_DATA(self);
9684 buf2 = PyUnicode_DATA(substring);
9685 if (kind1 != kind)
9686 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9687 if (!buf1) {
9688 Py_DECREF(substring);
9689 return NULL;
9690 }
9691 if (kind2 != kind)
9692 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9693 if (!buf2) {
9694 Py_DECREF(substring);
9695 if (kind1 != kind) PyMem_Free(buf1);
9696 return NULL;
9697 }
9698 len1 = PyUnicode_GET_LENGTH(self);
9699 len2 = PyUnicode_GET_LENGTH(substring);
9700
9701 ADJUST_INDICES(start, end, len1);
9702 switch(kind) {
9703 case PyUnicode_1BYTE_KIND:
9704 iresult = ucs1lib_count(
9705 ((Py_UCS1*)buf1) + start, end - start,
9706 buf2, len2, PY_SSIZE_T_MAX
9707 );
9708 break;
9709 case PyUnicode_2BYTE_KIND:
9710 iresult = ucs2lib_count(
9711 ((Py_UCS2*)buf1) + start, end - start,
9712 buf2, len2, PY_SSIZE_T_MAX
9713 );
9714 break;
9715 case PyUnicode_4BYTE_KIND:
9716 iresult = ucs4lib_count(
9717 ((Py_UCS4*)buf1) + start, end - start,
9718 buf2, len2, PY_SSIZE_T_MAX
9719 );
9720 break;
9721 default:
9722 assert(0); iresult = 0;
9723 }
9724
9725 result = PyLong_FromSsize_t(iresult);
9726
9727 if (kind1 != kind)
9728 PyMem_Free(buf1);
9729 if (kind2 != kind)
9730 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731
9732 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009733
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734 return result;
9735}
9736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009737PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009738 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009739\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009740Encode S using the codec registered for encoding. Default encoding\n\
9741is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009742handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009743a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9744'xmlcharrefreplace' as well as any other name registered with\n\
9745codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009746
9747static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009748unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009750 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751 char *encoding = NULL;
9752 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009753
Benjamin Peterson308d6372009-09-18 21:42:35 +00009754 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9755 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009757 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009758}
9759
PyDoc_STRVAR(expandtabs__doc__,
             "S.expandtabs([tabsize]) -> str\n\
\n\
Return a copy of S where all tab characters are expanded using spaces.\n\
If tabsize is not given, a tab size of 8 characters is assumed.");

/* Implementation of str.expandtabs().

   Works in two passes over the Py_UNICODE (wstr) representation of
   `self`: the first computes the length of the result while guarding
   against Py_ssize_t overflow, the second allocates the result and fills
   it.  Each tab is replaced by enough spaces to reach the next multiple of
   `tabsize` columns, where the column counter restarts after every '\n' or
   '\r'.  With tabsize <= 0 tabs are dropped from the output.

   Returns a new string, or NULL with an exception set. */
static PyObject*
unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
{
    Py_UNICODE *e;
    Py_UNICODE *p;
    Py_UNICODE *q;
    Py_UNICODE *qe;
    Py_ssize_t i, j, incr, wstr_length;
    PyUnicodeObject *u;
    int tabsize = 8;

    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
        return NULL;

    /* Only the length is taken from this call; the characters are read via
       _PyUnicode_WSTR(self) below.  NOTE(review): presumably this call also
       ensures the wstr buffer exists -- confirm. */
    if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
        return NULL;

    /* First pass: determine size of output string */
    i = 0; /* chars up to and including most recent \n or \r */
    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
    e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
    for (p = _PyUnicode_WSTR(self); p < e; p++)
        if (*p == '\t') {
            if (tabsize > 0) {
                incr = tabsize - (j % tabsize); /* cannot overflow */
                if (j > PY_SSIZE_T_MAX - incr)
                    goto overflow1;
                j += incr;
            }
        }
        else {
            if (j > PY_SSIZE_T_MAX - 1)
                goto overflow1;
            j++;
            if (*p == '\n' || *p == '\r') {
                /* line finished: fold its length into the running total */
                if (i > PY_SSIZE_T_MAX - j)
                    goto overflow1;
                i += j;
                j = 0;
            }
        }

    /* i + j (completed lines plus the unterminated tail) must fit too */
    if (i > PY_SSIZE_T_MAX - j)
        goto overflow1;

    /* Second pass: create output string and fill it */
    u = _PyUnicode_New(i + j);
    if (!u)
        return NULL;

    j = 0; /* same as in first pass */
    q = _PyUnicode_WSTR(u); /* next output char */
    qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */

    for (p = _PyUnicode_WSTR(self); p < e; p++)
        if (*p == '\t') {
            if (tabsize > 0) {
                /* i is reused here as the number of spaces to emit */
                i = tabsize - (j % tabsize);
                j += i;
                while (i--) {
                    if (q >= qe)
                        goto overflow2;
                    *q++ = ' ';
                }
            }
        }
        else {
            if (q >= qe)
                goto overflow2;
            *q++ = *p;
            j++;
            if (*p == '\n' || *p == '\r')
                j = 0;
        }

    if (PyUnicode_READY(u) == -1) {
        Py_DECREF(u);
        return NULL;
    }
    return (PyObject*) u;

    /* overflow2 is reached after `u` exists and must release it;
       overflow1 is reached before any allocation. */
  overflow2:
    Py_DECREF(u);
  overflow1:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
9853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009854PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009855 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856\n\
9857Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009858such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859arguments start and end are interpreted as in slice notation.\n\
9860\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009861Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009862
9863static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865{
Jesus Ceaac451502011-04-20 17:09:23 +02009866 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009867 Py_ssize_t start;
9868 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009869 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009870
Jesus Ceaac451502011-04-20 17:09:23 +02009871 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9872 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 if (PyUnicode_READY(self) == -1)
9876 return NULL;
9877 if (PyUnicode_READY(substring) == -1)
9878 return NULL;
9879
9880 result = any_find_slice(
9881 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9882 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009883 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884
9885 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 if (result == -2)
9888 return NULL;
9889
Christian Heimes217cfd12007-12-02 14:31:20 +00009890 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891}
9892
9893static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009894unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009896 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
9897 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900}
9901
Guido van Rossumc2504932007-09-18 19:42:40 +00009902/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009903 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009904static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009905unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906{
Guido van Rossumc2504932007-09-18 19:42:40 +00009907 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009908 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 if (_PyUnicode_HASH(self) != -1)
9911 return _PyUnicode_HASH(self);
9912 if (PyUnicode_READY(self) == -1)
9913 return -1;
9914 len = PyUnicode_GET_LENGTH(self);
9915
9916 /* The hash function as a macro, gets expanded three times below. */
9917#define HASH(P) \
9918 x = (Py_uhash_t)*P << 7; \
9919 while (--len >= 0) \
9920 x = (1000003*x) ^ (Py_uhash_t)*P++;
9921
9922 switch (PyUnicode_KIND(self)) {
9923 case PyUnicode_1BYTE_KIND: {
9924 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9925 HASH(c);
9926 break;
9927 }
9928 case PyUnicode_2BYTE_KIND: {
9929 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9930 HASH(s);
9931 break;
9932 }
9933 default: {
9934 Py_UCS4 *l;
9935 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9936 "Impossible switch case in unicode_hash");
9937 l = PyUnicode_4BYTE_DATA(self);
9938 HASH(l);
9939 break;
9940 }
9941 }
9942 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9943
Guido van Rossumc2504932007-09-18 19:42:40 +00009944 if (x == -1)
9945 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009947 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009951PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009952 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009954Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955
9956static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009959 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009960 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009961 Py_ssize_t start;
9962 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009963
Jesus Ceaac451502011-04-20 17:09:23 +02009964 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9965 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 if (PyUnicode_READY(self) == -1)
9969 return NULL;
9970 if (PyUnicode_READY(substring) == -1)
9971 return NULL;
9972
9973 result = any_find_slice(
9974 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9975 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009976 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977
9978 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 if (result == -2)
9981 return NULL;
9982
Guido van Rossumd57fd912000-03-10 22:53:23 +00009983 if (result < 0) {
9984 PyErr_SetString(PyExc_ValueError, "substring not found");
9985 return NULL;
9986 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009987
Christian Heimes217cfd12007-12-02 14:31:20 +00009988 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989}
9990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009991PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009992 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009994Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009995at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996
9997static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009998unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 Py_ssize_t i, length;
10001 int kind;
10002 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003 int cased;
10004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 if (PyUnicode_READY(self) == -1)
10006 return NULL;
10007 length = PyUnicode_GET_LENGTH(self);
10008 kind = PyUnicode_KIND(self);
10009 data = PyUnicode_DATA(self);
10010
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 if (length == 1)
10013 return PyBool_FromLong(
10014 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010016 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010018 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010019
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 for (i = 0; i < length; i++) {
10022 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010023
Benjamin Peterson29060642009-01-31 22:14:21 +000010024 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10025 return PyBool_FromLong(0);
10026 else if (!cased && Py_UNICODE_ISLOWER(ch))
10027 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010029 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030}
10031
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010032PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010033 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010035Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010036at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037
10038static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010039unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 Py_ssize_t i, length;
10042 int kind;
10043 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044 int cased;
10045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 if (PyUnicode_READY(self) == -1)
10047 return NULL;
10048 length = PyUnicode_GET_LENGTH(self);
10049 kind = PyUnicode_KIND(self);
10050 data = PyUnicode_DATA(self);
10051
Guido van Rossumd57fd912000-03-10 22:53:23 +000010052 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 if (length == 1)
10054 return PyBool_FromLong(
10055 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010056
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010057 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010059 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010060
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 for (i = 0; i < length; i++) {
10063 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010064
Benjamin Peterson29060642009-01-31 22:14:21 +000010065 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10066 return PyBool_FromLong(0);
10067 else if (!cased && Py_UNICODE_ISUPPER(ch))
10068 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010070 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010071}
10072
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010073PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010074 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010076Return True if S is a titlecased string and there is at least one\n\
10077character in S, i.e. upper- and titlecase characters may only\n\
10078follow uncased characters and lowercase characters only cased ones.\n\
10079Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010080
10081static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010082unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 Py_ssize_t i, length;
10085 int kind;
10086 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087 int cased, previous_is_cased;
10088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 if (PyUnicode_READY(self) == -1)
10090 return NULL;
10091 length = PyUnicode_GET_LENGTH(self);
10092 kind = PyUnicode_KIND(self);
10093 data = PyUnicode_DATA(self);
10094
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 if (length == 1) {
10097 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10098 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10099 (Py_UNICODE_ISUPPER(ch) != 0));
10100 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010101
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010102 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010104 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010105
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106 cased = 0;
10107 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 for (i = 0; i < length; i++) {
10109 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010110
Benjamin Peterson29060642009-01-31 22:14:21 +000010111 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10112 if (previous_is_cased)
10113 return PyBool_FromLong(0);
10114 previous_is_cased = 1;
10115 cased = 1;
10116 }
10117 else if (Py_UNICODE_ISLOWER(ch)) {
10118 if (!previous_is_cased)
10119 return PyBool_FromLong(0);
10120 previous_is_cased = 1;
10121 cased = 1;
10122 }
10123 else
10124 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010126 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127}
10128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010129PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010130 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010132Return True if all characters in S are whitespace\n\
10133and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134
10135static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010136unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 Py_ssize_t i, length;
10139 int kind;
10140 void *data;
10141
10142 if (PyUnicode_READY(self) == -1)
10143 return NULL;
10144 length = PyUnicode_GET_LENGTH(self);
10145 kind = PyUnicode_KIND(self);
10146 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 if (length == 1)
10150 return PyBool_FromLong(
10151 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010153 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010155 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 for (i = 0; i < length; i++) {
10158 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010159 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010160 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010162 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163}
10164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010165PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010166 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010167\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010168Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010169and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010170
10171static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010172unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 Py_ssize_t i, length;
10175 int kind;
10176 void *data;
10177
10178 if (PyUnicode_READY(self) == -1)
10179 return NULL;
10180 length = PyUnicode_GET_LENGTH(self);
10181 kind = PyUnicode_KIND(self);
10182 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010183
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010184 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 if (length == 1)
10186 return PyBool_FromLong(
10187 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010188
10189 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010191 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 for (i = 0; i < length; i++) {
10194 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010195 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010196 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010197 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010198}
10199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010200PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010201 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010202\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010203Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010204and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010205
10206static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010207unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010208{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 int kind;
10210 void *data;
10211 Py_ssize_t len, i;
10212
10213 if (PyUnicode_READY(self) == -1)
10214 return NULL;
10215
10216 kind = PyUnicode_KIND(self);
10217 data = PyUnicode_DATA(self);
10218 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010219
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010220 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 if (len == 1) {
10222 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10223 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10224 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010225
10226 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010228 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 for (i = 0; i < len; i++) {
10231 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010232 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010233 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010234 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010235 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010236}
10237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010238PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010239 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010241Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010242False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243
10244static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010245unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 Py_ssize_t i, length;
10248 int kind;
10249 void *data;
10250
10251 if (PyUnicode_READY(self) == -1)
10252 return NULL;
10253 length = PyUnicode_GET_LENGTH(self);
10254 kind = PyUnicode_KIND(self);
10255 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 if (length == 1)
10259 return PyBool_FromLong(
10260 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010262 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010264 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 for (i = 0; i < length; i++) {
10267 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010268 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010270 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010271}
10272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010273PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010274 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010276Return True if all characters in S are digits\n\
10277and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278
10279static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010280unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 Py_ssize_t i, length;
10283 int kind;
10284 void *data;
10285
10286 if (PyUnicode_READY(self) == -1)
10287 return NULL;
10288 length = PyUnicode_GET_LENGTH(self);
10289 kind = PyUnicode_KIND(self);
10290 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 if (length == 1) {
10294 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10295 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010298 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010300 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 for (i = 0; i < length; i++) {
10303 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010304 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010305 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010306 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307}
10308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010309PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010310 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010312Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010313False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314
10315static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010316unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 Py_ssize_t i, length;
10319 int kind;
10320 void *data;
10321
10322 if (PyUnicode_READY(self) == -1)
10323 return NULL;
10324 length = PyUnicode_GET_LENGTH(self);
10325 kind = PyUnicode_KIND(self);
10326 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 if (length == 1)
10330 return PyBool_FromLong(
10331 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010333 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010335 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 for (i = 0; i < length; i++) {
10338 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010339 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010341 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010342}
10343
Martin v. Löwis47383402007-08-15 07:32:56 +000010344int
10345PyUnicode_IsIdentifier(PyObject *self)
10346{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 int kind;
10348 void *data;
10349 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010350 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 if (PyUnicode_READY(self) == -1) {
10353 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010354 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 }
10356
10357 /* Special case for empty strings */
10358 if (PyUnicode_GET_LENGTH(self) == 0)
10359 return 0;
10360 kind = PyUnicode_KIND(self);
10361 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010362
10363 /* PEP 3131 says that the first character must be in
10364 XID_Start and subsequent characters in XID_Continue,
10365 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010366 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010367 letters, digits, underscore). However, given the current
10368 definition of XID_Start and XID_Continue, it is sufficient
10369 to check just for these, except that _ must be allowed
10370 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010372 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010373 return 0;
10374
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010375 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010377 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010378 return 1;
10379}
10380
10381PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010382 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010383\n\
10384Return True if S is a valid identifier according\n\
10385to the language definition.");
10386
10387static PyObject*
10388unicode_isidentifier(PyObject *self)
10389{
10390 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10391}
10392
Georg Brandl559e5d72008-06-11 18:37:52 +000010393PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010394 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010395\n\
10396Return True if all characters in S are considered\n\
10397printable in repr() or S is empty, False otherwise.");
10398
10399static PyObject*
10400unicode_isprintable(PyObject *self)
10401{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 Py_ssize_t i, length;
10403 int kind;
10404 void *data;
10405
10406 if (PyUnicode_READY(self) == -1)
10407 return NULL;
10408 length = PyUnicode_GET_LENGTH(self);
10409 kind = PyUnicode_KIND(self);
10410 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010411
10412 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 if (length == 1)
10414 return PyBool_FromLong(
10415 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 for (i = 0; i < length; i++) {
10418 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010419 Py_RETURN_FALSE;
10420 }
10421 }
10422 Py_RETURN_TRUE;
10423}
10424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010425PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010426 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010427\n\
10428Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010429iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430
10431static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010432unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010433{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010434 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435}
10436
Martin v. Löwis18e16552006-02-15 17:27:45 +000010437static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010438unicode_length(PyUnicodeObject *self)
10439{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 if (PyUnicode_READY(self) == -1)
10441 return -1;
10442 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443}
10444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010445PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010446 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010447\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010448Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010449done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010450
10451static PyObject *
10452unicode_ljust(PyUnicodeObject *self, PyObject *args)
10453{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010454 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 Py_UCS4 fillchar = ' ';
10456
10457 if (PyUnicode_READY(self) == -1)
10458 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010459
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010460 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010461 return NULL;
10462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010464 Py_INCREF(self);
10465 return (PyObject*) self;
10466 }
10467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010469}
10470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010471PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010472 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010474Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475
10476static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010477unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010478{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479 return fixup(self, fixlower);
10480}
10481
/* Strip modes: select which end(s) of the string do_strip() and
   _PyUnicode_XStrip() trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip modes above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the three-character "|O:" prefix
   of the matching format string */
#define STRIPNAME(i) (stripformat[i]+3)
10490
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010491/* externally visible for str.strip(unicode) */
10492PyObject *
10493_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 void *data;
10496 int kind;
10497 Py_ssize_t i, j, len;
10498 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10501 return NULL;
10502
10503 kind = PyUnicode_KIND(self);
10504 data = PyUnicode_DATA(self);
10505 len = PyUnicode_GET_LENGTH(self);
10506 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10507 PyUnicode_DATA(sepobj),
10508 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010509
Benjamin Peterson14339b62009-01-31 16:36:08 +000010510 i = 0;
10511 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 while (i < len &&
10513 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010514 i++;
10515 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010516 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010517
Benjamin Peterson14339b62009-01-31 16:36:08 +000010518 j = len;
10519 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010520 do {
10521 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 } while (j >= i &&
10523 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010524 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010525 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010526
Victor Stinner12bab6d2011-10-01 01:53:49 +020010527 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528}
10529
10530PyObject*
10531PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10532{
10533 unsigned char *data;
10534 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010535 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536
Victor Stinnerde636f32011-10-01 03:55:54 +020010537 if (PyUnicode_READY(self) == -1)
10538 return NULL;
10539
10540 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10541
Victor Stinner12bab6d2011-10-01 01:53:49 +020010542 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010544 if (PyUnicode_CheckExact(self)) {
10545 Py_INCREF(self);
10546 return self;
10547 }
10548 else
10549 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 }
10551
Victor Stinner12bab6d2011-10-01 01:53:49 +020010552 length = end - start;
10553 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010554 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555
Victor Stinnerde636f32011-10-01 03:55:54 +020010556 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010557 PyErr_SetString(PyExc_IndexError, "string index out of range");
10558 return NULL;
10559 }
10560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 kind = PyUnicode_KIND(self);
10562 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010563 return PyUnicode_FromKindAndData(kind,
10564 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010565 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567
10568static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010569do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 int kind;
10572 void *data;
10573 Py_ssize_t len, i, j;
10574
10575 if (PyUnicode_READY(self) == -1)
10576 return NULL;
10577
10578 kind = PyUnicode_KIND(self);
10579 data = PyUnicode_DATA(self);
10580 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010581
Benjamin Peterson14339b62009-01-31 16:36:08 +000010582 i = 0;
10583 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010585 i++;
10586 }
10587 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010588
Benjamin Peterson14339b62009-01-31 16:36:08 +000010589 j = len;
10590 if (striptype != LEFTSTRIP) {
10591 do {
10592 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010594 j++;
10595 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010596
Victor Stinner12bab6d2011-10-01 01:53:49 +020010597 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598}
10599
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010600
10601static PyObject *
10602do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10603{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010604 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010605
Benjamin Peterson14339b62009-01-31 16:36:08 +000010606 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10607 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010608
Benjamin Peterson14339b62009-01-31 16:36:08 +000010609 if (sep != NULL && sep != Py_None) {
10610 if (PyUnicode_Check(sep))
10611 return _PyUnicode_XStrip(self, striptype, sep);
10612 else {
10613 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010614 "%s arg must be None or str",
10615 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010616 return NULL;
10617 }
10618 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010619
Benjamin Peterson14339b62009-01-31 16:36:08 +000010620 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010621}
10622
10623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010624PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010625 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010626\n\
10627Return a copy of the string S with leading and trailing\n\
10628whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010629If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010630
10631static PyObject *
10632unicode_strip(PyUnicodeObject *self, PyObject *args)
10633{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010634 if (PyTuple_GET_SIZE(args) == 0)
10635 return do_strip(self, BOTHSTRIP); /* Common case */
10636 else
10637 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010638}
10639
10640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010641PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010642 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010643\n\
10644Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010645If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010646
10647static PyObject *
10648unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10649{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010650 if (PyTuple_GET_SIZE(args) == 0)
10651 return do_strip(self, LEFTSTRIP); /* Common case */
10652 else
10653 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010654}
10655
10656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010657PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010658 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010659\n\
10660Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010661If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010662
10663static PyObject *
10664unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10665{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010666 if (PyTuple_GET_SIZE(args) == 0)
10667 return do_strip(self, RIGHTSTRIP); /* Common case */
10668 else
10669 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010670}
10671
10672
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010674unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010675{
10676 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678
Georg Brandl222de0f2009-04-12 12:01:50 +000010679 if (len < 1) {
10680 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020010681 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000010682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683
Tim Peters7a29bd52001-09-12 03:03:31 +000010684 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010685 /* no repeat, return original string */
10686 Py_INCREF(str);
10687 return (PyObject*) str;
10688 }
Tim Peters8f422462000-09-09 06:13:41 +000010689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (PyUnicode_READY(str) == -1)
10691 return NULL;
10692
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010693 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010694 PyErr_SetString(PyExc_OverflowError,
10695 "repeated string is too long");
10696 return NULL;
10697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701 if (!u)
10702 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010703 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 if (PyUnicode_GET_LENGTH(str) == 1) {
10706 const int kind = PyUnicode_KIND(str);
10707 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10708 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010709 if (kind == PyUnicode_1BYTE_KIND)
10710 memset(to, (unsigned char)fill_char, len);
10711 else {
10712 for (n = 0; n < len; ++n)
10713 PyUnicode_WRITE(kind, to, n, fill_char);
10714 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 }
10716 else {
10717 /* number of characters copied this far */
10718 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10719 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10720 char *to = (char *) PyUnicode_DATA(u);
10721 Py_MEMCPY(to, PyUnicode_DATA(str),
10722 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010723 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 n = (done <= nchars-done) ? done : nchars-done;
10725 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010726 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010727 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728 }
10729
10730 return (PyObject*) u;
10731}
10732
Alexander Belopolsky40018472011-02-26 01:02:56 +000010733PyObject *
10734PyUnicode_Replace(PyObject *obj,
10735 PyObject *subobj,
10736 PyObject *replobj,
10737 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738{
10739 PyObject *self;
10740 PyObject *str1;
10741 PyObject *str2;
10742 PyObject *result;
10743
10744 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010745 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010748 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010749 Py_DECREF(self);
10750 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751 }
10752 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010753 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010754 Py_DECREF(self);
10755 Py_DECREF(str1);
10756 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759 Py_DECREF(self);
10760 Py_DECREF(str1);
10761 Py_DECREF(str2);
10762 return result;
10763}
10764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010765PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010766 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767\n\
10768Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010769old replaced by new. If the optional argument count is\n\
10770given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771
10772static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 PyObject *str1;
10776 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010777 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778 PyObject *result;
10779
Martin v. Löwis18e16552006-02-15 17:27:45 +000010780 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010783 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 str1 = PyUnicode_FromObject(str1);
10785 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10786 return NULL;
10787 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020010788 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010789 Py_DECREF(str1);
10790 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010792
10793 result = replace(self, str1, str2, maxcount);
10794
10795 Py_DECREF(str1);
10796 Py_DECREF(str2);
10797 return result;
10798}
10799
Alexander Belopolsky40018472011-02-26 01:02:56 +000010800static PyObject *
10801unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010803 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 Py_ssize_t isize;
10805 Py_ssize_t osize, squote, dquote, i, o;
10806 Py_UCS4 max, quote;
10807 int ikind, okind;
10808 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010811 return NULL;
10812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 isize = PyUnicode_GET_LENGTH(unicode);
10814 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 /* Compute length of output, quote characters, and
10817 maximum character */
10818 osize = 2; /* quotes */
10819 max = 127;
10820 squote = dquote = 0;
10821 ikind = PyUnicode_KIND(unicode);
10822 for (i = 0; i < isize; i++) {
10823 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10824 switch (ch) {
10825 case '\'': squote++; osize++; break;
10826 case '"': dquote++; osize++; break;
10827 case '\\': case '\t': case '\r': case '\n':
10828 osize += 2; break;
10829 default:
10830 /* Fast-path ASCII */
10831 if (ch < ' ' || ch == 0x7f)
10832 osize += 4; /* \xHH */
10833 else if (ch < 0x7f)
10834 osize++;
10835 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10836 osize++;
10837 max = ch > max ? ch : max;
10838 }
10839 else if (ch < 0x100)
10840 osize += 4; /* \xHH */
10841 else if (ch < 0x10000)
10842 osize += 6; /* \uHHHH */
10843 else
10844 osize += 10; /* \uHHHHHHHH */
10845 }
10846 }
10847
10848 quote = '\'';
10849 if (squote) {
10850 if (dquote)
10851 /* Both squote and dquote present. Use squote,
10852 and escape them */
10853 osize += squote;
10854 else
10855 quote = '"';
10856 }
10857
10858 repr = PyUnicode_New(osize, max);
10859 if (repr == NULL)
10860 return NULL;
10861 okind = PyUnicode_KIND(repr);
10862 odata = PyUnicode_DATA(repr);
10863
10864 PyUnicode_WRITE(okind, odata, 0, quote);
10865 PyUnicode_WRITE(okind, odata, osize-1, quote);
10866
10867 for (i = 0, o = 1; i < isize; i++) {
10868 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010869
10870 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010871 if ((ch == quote) || (ch == '\\')) {
10872 PyUnicode_WRITE(okind, odata, o++, '\\');
10873 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010874 continue;
10875 }
10876
Benjamin Peterson29060642009-01-31 22:14:21 +000010877 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010878 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879 PyUnicode_WRITE(okind, odata, o++, '\\');
10880 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010881 }
10882 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 PyUnicode_WRITE(okind, odata, o++, '\\');
10884 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010885 }
10886 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010887 PyUnicode_WRITE(okind, odata, o++, '\\');
10888 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010889 }
10890
10891 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010892 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 PyUnicode_WRITE(okind, odata, o++, '\\');
10894 PyUnicode_WRITE(okind, odata, o++, 'x');
10895 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10896 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010897 }
10898
Georg Brandl559e5d72008-06-11 18:37:52 +000010899 /* Copy ASCII characters as-is */
10900 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010901 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010902 }
10903
Benjamin Peterson29060642009-01-31 22:14:21 +000010904 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010905 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010906 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010907 (categories Z* and C* except ASCII space)
10908 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010910 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010911 if (ch <= 0xff) {
10912 PyUnicode_WRITE(okind, odata, o++, '\\');
10913 PyUnicode_WRITE(okind, odata, o++, 'x');
10914 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10915 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010916 }
10917 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918 else if (ch >= 0x10000) {
10919 PyUnicode_WRITE(okind, odata, o++, '\\');
10920 PyUnicode_WRITE(okind, odata, o++, 'U');
10921 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10922 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10923 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10924 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10925 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10926 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10927 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10928 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010929 }
10930 /* Map 16-bit characters to '\uxxxx' */
10931 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 PyUnicode_WRITE(okind, odata, o++, '\\');
10933 PyUnicode_WRITE(okind, odata, o++, 'u');
10934 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10935 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10936 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10937 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010938 }
10939 }
10940 /* Copy characters as-is */
10941 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010943 }
10944 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010947 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948}
10949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010950PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010951 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952\n\
10953Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010954such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955arguments start and end are interpreted as in slice notation.\n\
10956\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010957Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
10959static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961{
Jesus Ceaac451502011-04-20 17:09:23 +020010962 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010963 Py_ssize_t start;
10964 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010965 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
Jesus Ceaac451502011-04-20 17:09:23 +020010967 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10968 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010969 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 if (PyUnicode_READY(self) == -1)
10972 return NULL;
10973 if (PyUnicode_READY(substring) == -1)
10974 return NULL;
10975
10976 result = any_find_slice(
10977 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10978 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010979 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980
10981 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010983 if (result == -2)
10984 return NULL;
10985
Christian Heimes217cfd12007-12-02 14:31:20 +000010986 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987}
10988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010989PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010990 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010992Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993
10994static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996{
Jesus Ceaac451502011-04-20 17:09:23 +020010997 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010998 Py_ssize_t start;
10999 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011000 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001
Jesus Ceaac451502011-04-20 17:09:23 +020011002 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11003 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011004 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 if (PyUnicode_READY(self) == -1)
11007 return NULL;
11008 if (PyUnicode_READY(substring) == -1)
11009 return NULL;
11010
11011 result = any_find_slice(
11012 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11013 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011014 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015
11016 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 if (result == -2)
11019 return NULL;
11020
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 if (result < 0) {
11022 PyErr_SetString(PyExc_ValueError, "substring not found");
11023 return NULL;
11024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025
Christian Heimes217cfd12007-12-02 14:31:20 +000011026 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027}
11028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011029PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011030 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011032Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011033done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034
11035static PyObject *
11036unicode_rjust(PyUnicodeObject *self, PyObject *args)
11037{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011038 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 Py_UCS4 fillchar = ' ';
11040
Victor Stinnere9a29352011-10-01 02:14:59 +020011041 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011043
Victor Stinnere9a29352011-10-01 02:14:59 +020011044 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045 return NULL;
11046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048 Py_INCREF(self);
11049 return (PyObject*) self;
11050 }
11051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053}
11054
Alexander Belopolsky40018472011-02-26 01:02:56 +000011055PyObject *
11056PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057{
11058 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011059
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060 s = PyUnicode_FromObject(s);
11061 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011062 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011063 if (sep != NULL) {
11064 sep = PyUnicode_FromObject(sep);
11065 if (sep == NULL) {
11066 Py_DECREF(s);
11067 return NULL;
11068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069 }
11070
11071 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11072
11073 Py_DECREF(s);
11074 Py_XDECREF(sep);
11075 return result;
11076}
11077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011078PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011079 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080\n\
11081Return a list of the words in S, using sep as the\n\
11082delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011083splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011084whitespace string is a separator and empty strings are\n\
11085removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011086
11087static PyObject*
11088unicode_split(PyUnicodeObject *self, PyObject *args)
11089{
11090 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011091 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092
Martin v. Löwis18e16552006-02-15 17:27:45 +000011093 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094 return NULL;
11095
11096 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011097 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011099 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011101 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011102}
11103
Thomas Wouters477c8d52006-05-27 19:21:47 +000011104PyObject *
11105PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11106{
11107 PyObject* str_obj;
11108 PyObject* sep_obj;
11109 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 int kind1, kind2, kind;
11111 void *buf1 = NULL, *buf2 = NULL;
11112 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011113
11114 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011115 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011116 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011117 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011119 Py_DECREF(str_obj);
11120 return NULL;
11121 }
11122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 kind1 = PyUnicode_KIND(str_in);
11124 kind2 = PyUnicode_KIND(sep_obj);
11125 kind = kind1 > kind2 ? kind1 : kind2;
11126 buf1 = PyUnicode_DATA(str_in);
11127 if (kind1 != kind)
11128 buf1 = _PyUnicode_AsKind(str_in, kind);
11129 if (!buf1)
11130 goto onError;
11131 buf2 = PyUnicode_DATA(sep_obj);
11132 if (kind2 != kind)
11133 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11134 if (!buf2)
11135 goto onError;
11136 len1 = PyUnicode_GET_LENGTH(str_obj);
11137 len2 = PyUnicode_GET_LENGTH(sep_obj);
11138
11139 switch(PyUnicode_KIND(str_in)) {
11140 case PyUnicode_1BYTE_KIND:
11141 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11142 break;
11143 case PyUnicode_2BYTE_KIND:
11144 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11145 break;
11146 case PyUnicode_4BYTE_KIND:
11147 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11148 break;
11149 default:
11150 assert(0);
11151 out = 0;
11152 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011153
11154 Py_DECREF(sep_obj);
11155 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 if (kind1 != kind)
11157 PyMem_Free(buf1);
11158 if (kind2 != kind)
11159 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011160
11161 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 onError:
11163 Py_DECREF(sep_obj);
11164 Py_DECREF(str_obj);
11165 if (kind1 != kind && buf1)
11166 PyMem_Free(buf1);
11167 if (kind2 != kind && buf2)
11168 PyMem_Free(buf2);
11169 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011170}
11171
11172
11173PyObject *
11174PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11175{
11176 PyObject* str_obj;
11177 PyObject* sep_obj;
11178 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 int kind1, kind2, kind;
11180 void *buf1 = NULL, *buf2 = NULL;
11181 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011182
11183 str_obj = PyUnicode_FromObject(str_in);
11184 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011186 sep_obj = PyUnicode_FromObject(sep_in);
11187 if (!sep_obj) {
11188 Py_DECREF(str_obj);
11189 return NULL;
11190 }
11191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 kind1 = PyUnicode_KIND(str_in);
11193 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011194 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 buf1 = PyUnicode_DATA(str_in);
11196 if (kind1 != kind)
11197 buf1 = _PyUnicode_AsKind(str_in, kind);
11198 if (!buf1)
11199 goto onError;
11200 buf2 = PyUnicode_DATA(sep_obj);
11201 if (kind2 != kind)
11202 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11203 if (!buf2)
11204 goto onError;
11205 len1 = PyUnicode_GET_LENGTH(str_obj);
11206 len2 = PyUnicode_GET_LENGTH(sep_obj);
11207
11208 switch(PyUnicode_KIND(str_in)) {
11209 case PyUnicode_1BYTE_KIND:
11210 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11211 break;
11212 case PyUnicode_2BYTE_KIND:
11213 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11214 break;
11215 case PyUnicode_4BYTE_KIND:
11216 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11217 break;
11218 default:
11219 assert(0);
11220 out = 0;
11221 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011222
11223 Py_DECREF(sep_obj);
11224 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 if (kind1 != kind)
11226 PyMem_Free(buf1);
11227 if (kind2 != kind)
11228 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011229
11230 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 onError:
11232 Py_DECREF(sep_obj);
11233 Py_DECREF(str_obj);
11234 if (kind1 != kind && buf1)
11235 PyMem_Free(buf1);
11236 if (kind2 != kind && buf2)
11237 PyMem_Free(buf2);
11238 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011239}
11240
11241PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011242 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011243\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011244Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011245the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011246found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011247
11248static PyObject*
11249unicode_partition(PyUnicodeObject *self, PyObject *separator)
11250{
11251 return PyUnicode_Partition((PyObject *)self, separator);
11252}
11253
11254PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011255 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011256\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011257Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011258the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011259separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011260
11261static PyObject*
11262unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11263{
11264 return PyUnicode_RPartition((PyObject *)self, separator);
11265}
11266
Alexander Belopolsky40018472011-02-26 01:02:56 +000011267PyObject *
11268PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011269{
11270 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011271
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011272 s = PyUnicode_FromObject(s);
11273 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011274 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011275 if (sep != NULL) {
11276 sep = PyUnicode_FromObject(sep);
11277 if (sep == NULL) {
11278 Py_DECREF(s);
11279 return NULL;
11280 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011281 }
11282
11283 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11284
11285 Py_DECREF(s);
11286 Py_XDECREF(sep);
11287 return result;
11288}
11289
11290PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011291 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011292\n\
11293Return a list of the words in S, using sep as the\n\
11294delimiter string, starting at the end of the string and\n\
11295working to the front. If maxsplit is given, at most maxsplit\n\
11296splits are done. If sep is not specified, any whitespace string\n\
11297is a separator.");
11298
11299static PyObject*
11300unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11301{
11302 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011303 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011304
Martin v. Löwis18e16552006-02-15 17:27:45 +000011305 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011306 return NULL;
11307
11308 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011309 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011310 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011311 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011312 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011313 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011314}
11315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011316PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011317 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318\n\
11319Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011320Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011321is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
11323static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011324unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011326 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011327 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011329 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11330 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331 return NULL;
11332
Guido van Rossum86662912000-04-11 15:38:46 +000011333 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334}
11335
11336static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011337PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338{
Walter Dörwald346737f2007-05-31 10:44:43 +000011339 if (PyUnicode_CheckExact(self)) {
11340 Py_INCREF(self);
11341 return self;
11342 } else
11343 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011344 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345}
11346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011347PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011348 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349\n\
11350Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011351and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352
11353static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011354unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356 return fixup(self, fixswapcase);
11357}
11358
Georg Brandlceee0772007-11-27 23:48:05 +000011359PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011360 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011361\n\
11362Return a translation table usable for str.translate().\n\
11363If there is only one argument, it must be a dictionary mapping Unicode\n\
11364ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011365Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011366If there are two arguments, they must be strings of equal length, and\n\
11367in the resulting dictionary, each character in x will be mapped to the\n\
11368character at the same position in y. If there is a third argument, it\n\
11369must be a string, whose characters will be mapped to None in the result.");
11370
11371static PyObject*
11372unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11373{
11374 PyObject *x, *y = NULL, *z = NULL;
11375 PyObject *new = NULL, *key, *value;
11376 Py_ssize_t i = 0;
11377 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011378
Georg Brandlceee0772007-11-27 23:48:05 +000011379 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11380 return NULL;
11381 new = PyDict_New();
11382 if (!new)
11383 return NULL;
11384 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011385 int x_kind, y_kind, z_kind;
11386 void *x_data, *y_data, *z_data;
11387
Georg Brandlceee0772007-11-27 23:48:05 +000011388 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011389 if (!PyUnicode_Check(x)) {
11390 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11391 "be a string if there is a second argument");
11392 goto err;
11393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011395 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11396 "arguments must have equal length");
11397 goto err;
11398 }
11399 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 x_kind = PyUnicode_KIND(x);
11401 y_kind = PyUnicode_KIND(y);
11402 x_data = PyUnicode_DATA(x);
11403 y_data = PyUnicode_DATA(y);
11404 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11405 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11406 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011407 if (!key || !value)
11408 goto err;
11409 res = PyDict_SetItem(new, key, value);
11410 Py_DECREF(key);
11411 Py_DECREF(value);
11412 if (res < 0)
11413 goto err;
11414 }
11415 /* create entries for deleting chars in z */
11416 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 z_kind = PyUnicode_KIND(z);
11418 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011419 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011421 if (!key)
11422 goto err;
11423 res = PyDict_SetItem(new, key, Py_None);
11424 Py_DECREF(key);
11425 if (res < 0)
11426 goto err;
11427 }
11428 }
11429 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 int kind;
11431 void *data;
11432
Georg Brandlceee0772007-11-27 23:48:05 +000011433 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011434 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011435 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11436 "to maketrans it must be a dict");
11437 goto err;
11438 }
11439 /* copy entries into the new dict, converting string keys to int keys */
11440 while (PyDict_Next(x, &i, &key, &value)) {
11441 if (PyUnicode_Check(key)) {
11442 /* convert string keys to integer keys */
11443 PyObject *newkey;
11444 if (PyUnicode_GET_SIZE(key) != 1) {
11445 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11446 "table must be of length 1");
11447 goto err;
11448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 kind = PyUnicode_KIND(key);
11450 data = PyUnicode_DATA(key);
11451 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011452 if (!newkey)
11453 goto err;
11454 res = PyDict_SetItem(new, newkey, value);
11455 Py_DECREF(newkey);
11456 if (res < 0)
11457 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011458 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011459 /* just keep integer keys */
11460 if (PyDict_SetItem(new, key, value) < 0)
11461 goto err;
11462 } else {
11463 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11464 "be strings or integers");
11465 goto err;
11466 }
11467 }
11468 }
11469 return new;
11470 err:
11471 Py_DECREF(new);
11472 return NULL;
11473}
11474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011475PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011476 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477\n\
11478Return a copy of the string S, where all characters have been mapped\n\
11479through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011480Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011481Unmapped characters are left untouched. Characters mapped to None\n\
11482are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
11484static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488}
11489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011490PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011493Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494
11495static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011496unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498 return fixup(self, fixupper);
11499}
11500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011501PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011502 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011504Pad a numeric string S with zeros on the left, to fill a field\n\
11505of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
11507static PyObject *
11508unicode_zfill(PyUnicodeObject *self, PyObject *args)
11509{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011510 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011512 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 int kind;
11514 void *data;
11515 Py_UCS4 chr;
11516
11517 if (PyUnicode_READY(self) == -1)
11518 return NULL;
11519
Martin v. Löwis18e16552006-02-15 17:27:45 +000011520 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521 return NULL;
11522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011524 if (PyUnicode_CheckExact(self)) {
11525 Py_INCREF(self);
11526 return (PyObject*) self;
11527 }
11528 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011529 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530 }
11531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533
11534 u = pad(self, fill, 0, '0');
11535
Walter Dörwald068325e2002-04-15 13:36:47 +000011536 if (u == NULL)
11537 return NULL;
11538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 kind = PyUnicode_KIND(u);
11540 data = PyUnicode_DATA(u);
11541 chr = PyUnicode_READ(kind, data, fill);
11542
11543 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545 PyUnicode_WRITE(kind, data, 0, chr);
11546 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547 }
11548
11549 return (PyObject*) u;
11550}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551
#if 0
/* Disabled wrapper: returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
11559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011560PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011563Return True if S starts with the specified prefix, False otherwise.\n\
11564With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011565With optional end, stop comparing S at that position.\n\
11566prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567
11568static PyObject *
11569unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011570 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011572 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011574 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011575 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011576 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
Jesus Ceaac451502011-04-20 17:09:23 +020011578 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011579 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011580 if (PyTuple_Check(subobj)) {
11581 Py_ssize_t i;
11582 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11583 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011584 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011585 if (substring == NULL)
11586 return NULL;
11587 result = tailmatch(self, substring, start, end, -1);
11588 Py_DECREF(substring);
11589 if (result) {
11590 Py_RETURN_TRUE;
11591 }
11592 }
11593 /* nothing matched */
11594 Py_RETURN_FALSE;
11595 }
11596 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011597 if (substring == NULL) {
11598 if (PyErr_ExceptionMatches(PyExc_TypeError))
11599 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11600 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011602 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011603 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011605 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606}
11607
11608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011609PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011610 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011612Return True if S ends with the specified suffix, False otherwise.\n\
11613With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011614With optional end, stop comparing S at that position.\n\
11615suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616
11617static PyObject *
11618unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011621 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011623 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011624 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011625 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626
Jesus Ceaac451502011-04-20 17:09:23 +020011627 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011628 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011629 if (PyTuple_Check(subobj)) {
11630 Py_ssize_t i;
11631 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11632 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011634 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011635 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011636 result = tailmatch(self, substring, start, end, +1);
11637 Py_DECREF(substring);
11638 if (result) {
11639 Py_RETURN_TRUE;
11640 }
11641 }
11642 Py_RETURN_FALSE;
11643 }
11644 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011645 if (substring == NULL) {
11646 if (PyErr_ExceptionMatches(PyExc_TypeError))
11647 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11648 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011650 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011651 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011653 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654}
11655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011657
11658PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011659 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011660\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011661Return a formatted version of S, using substitutions from args and kwargs.\n\
11662The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011663
Eric Smith27bbca62010-11-04 17:06:58 +000011664PyDoc_STRVAR(format_map__doc__,
11665 "S.format_map(mapping) -> str\n\
11666\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011667Return a formatted version of S, using substitutions from mapping.\n\
11668The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011669
Eric Smith4a7d76d2008-05-30 18:10:19 +000011670static PyObject *
11671unicode__format__(PyObject* self, PyObject* args)
11672{
11673 PyObject *format_spec;
11674
11675 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11676 return NULL;
11677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11679 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011680}
11681
Eric Smith8c663262007-08-25 02:26:07 +000011682PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011683 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011684\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011685Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011686
11687static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011688unicode__sizeof__(PyUnicodeObject *v)
11689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 Py_ssize_t size;
11691
11692 /* If it's a compact object, account for base structure +
11693 character data. */
11694 if (PyUnicode_IS_COMPACT_ASCII(v))
11695 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11696 else if (PyUnicode_IS_COMPACT(v))
11697 size = sizeof(PyCompactUnicodeObject) +
11698 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11699 else {
11700 /* If it is a two-block object, account for base object, and
11701 for character block if present. */
11702 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020011703 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 size += (PyUnicode_GET_LENGTH(v) + 1) *
11705 PyUnicode_CHARACTER_SIZE(v);
11706 }
11707 /* If the wstr pointer is present, account for it unless it is shared
11708 with the data pointer. Since PyUnicode_DATA will crash if the object
11709 is not ready, check whether it's either not ready (in which case the
11710 data is entirely in wstr) or if the data is not shared. */
11711 if (_PyUnicode_WSTR(v) &&
11712 (!PyUnicode_IS_READY(v) ||
11713 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11714 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011715 if (!PyUnicode_IS_COMPACT_ASCII(v)
11716 && _PyUnicode_UTF8(v)
11717 && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11718 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719
11720 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011721}
11722
11723PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011725
11726static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011727unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011728{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011729 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 if (!copy)
11731 return NULL;
11732 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011733}
11734
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735static PyMethodDef unicode_methods[] = {
11736
11737 /* Order is according to common usage: often used methods should
11738 appear first, since lookup is done sequentially. */
11739
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011740 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011741 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11742 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011743 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011744 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11745 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11746 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11747 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11748 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11749 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11750 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011751 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011752 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11753 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11754 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011755 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011756 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11757 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11758 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011759 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011760 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011761 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011762 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011763 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11764 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11765 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11766 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11767 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11768 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11769 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11770 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11771 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11772 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11773 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11774 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11775 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11776 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011777 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011778 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011779 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011780 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011781 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011782 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011783 {"maketrans", (PyCFunction) unicode_maketrans,
11784 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011785 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011786#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011787 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788#endif
11789
11790#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011791 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011792 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793#endif
11794
Benjamin Peterson14339b62009-01-31 16:36:08 +000011795 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796 {NULL, NULL}
11797};
11798
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011799static PyObject *
11800unicode_mod(PyObject *v, PyObject *w)
11801{
Brian Curtindfc80e32011-08-10 20:28:54 -050011802 if (!PyUnicode_Check(v))
11803 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011804 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011805}
11806
11807static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011808 0, /*nb_add*/
11809 0, /*nb_subtract*/
11810 0, /*nb_multiply*/
11811 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011812};
11813
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011815 (lenfunc) unicode_length, /* sq_length */
11816 PyUnicode_Concat, /* sq_concat */
11817 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11818 (ssizeargfunc) unicode_getitem, /* sq_item */
11819 0, /* sq_slice */
11820 0, /* sq_ass_item */
11821 0, /* sq_ass_slice */
11822 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823};
11824
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011825static PyObject*
11826unicode_subscript(PyUnicodeObject* self, PyObject* item)
11827{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 if (PyUnicode_READY(self) == -1)
11829 return NULL;
11830
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011831 if (PyIndex_Check(item)) {
11832 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011833 if (i == -1 && PyErr_Occurred())
11834 return NULL;
11835 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011837 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011838 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011839 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011841 Py_UNICODE* result_buf;
11842 PyObject* result;
11843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011846 return NULL;
11847 }
11848
11849 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 return PyUnicode_New(0, 0);
11851 } else if (start == 0 && step == 1 &&
11852 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011853 PyUnicode_CheckExact(self)) {
11854 Py_INCREF(self);
11855 return (PyObject *)self;
11856 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011857 return PyUnicode_Substring((PyObject*)self,
11858 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011859 } else {
11860 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011861 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11862 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011863
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 if (result_buf == NULL)
11865 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011866
11867 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11868 result_buf[i] = source_buf[cur];
11869 }
Tim Petersced69f82003-09-16 20:30:58 +000011870
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011871 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011872 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011873 return result;
11874 }
11875 } else {
11876 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11877 return NULL;
11878 }
11879}
11880
11881static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011882 (lenfunc)unicode_length, /* mp_length */
11883 (binaryfunc)unicode_subscript, /* mp_subscript */
11884 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011885};
11886
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888/* Helpers for PyUnicode_Format() */
11889
11890static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011891getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011893 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011895 (*p_argidx)++;
11896 if (arglen < 0)
11897 return args;
11898 else
11899 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900 }
11901 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903 return NULL;
11904}
11905
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011906/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011908static PyObject *
11909formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011911 char *p;
11912 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011914
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915 x = PyFloat_AsDouble(v);
11916 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011917 return NULL;
11918
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011920 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011921
Eric Smith0923d1d2009-04-16 20:16:10 +000011922 p = PyOS_double_to_string(x, type, prec,
11923 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011924 if (p == NULL)
11925 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011927 PyMem_Free(p);
11928 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929}
11930
Tim Peters38fd5b62000-09-21 05:43:11 +000011931static PyObject*
11932formatlong(PyObject *val, int flags, int prec, int type)
11933{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011934 char *buf;
11935 int len;
11936 PyObject *str; /* temporary string object. */
11937 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011938
Benjamin Peterson14339b62009-01-31 16:36:08 +000011939 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11940 if (!str)
11941 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011943 Py_DECREF(str);
11944 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011945}
11946
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011949 size_t buflen,
11950 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011952 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011953 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 if (PyUnicode_GET_LENGTH(v) == 1) {
11955 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 buf[1] = '\0';
11957 return 1;
11958 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 goto onError;
11960 }
11961 else {
11962 /* Integer input truncated to a character */
11963 long x;
11964 x = PyLong_AsLong(v);
11965 if (x == -1 && PyErr_Occurred())
11966 goto onError;
11967
11968 if (x < 0 || x > 0x10ffff) {
11969 PyErr_SetString(PyExc_OverflowError,
11970 "%c arg not in range(0x110000)");
11971 return -1;
11972 }
11973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011975 buf[1] = '\0';
11976 return 1;
11977 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011978
Benjamin Peterson29060642009-01-31 22:14:21 +000011979 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011980 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011982 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983}
11984
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so that it behaves as a single
   operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011989
Alexander Belopolsky40018472011-02-26 01:02:56 +000011990PyObject *
11991PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 void *fmt;
11994 int fmtkind;
11995 PyObject *result;
11996 Py_UCS4 *res, *res0;
11997 Py_UCS4 max;
11998 int kind;
11999 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012003
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 PyErr_BadInternalCall();
12006 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12009 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012010 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 fmt = PyUnicode_DATA(uformat);
12012 fmtkind = PyUnicode_KIND(uformat);
12013 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12014 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015
12016 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12018 if (res0 == NULL) {
12019 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022
12023 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012024 arglen = PyTuple_Size(args);
12025 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026 }
12027 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012028 arglen = -1;
12029 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012031 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012032 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012033 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034
12035 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012037 if (--rescnt < 0) {
12038 rescnt = fmtcnt + 100;
12039 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12041 if (res0 == NULL){
12042 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012043 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 }
12045 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012046 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012049 }
12050 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012051 /* Got a format specifier */
12052 int flags = 0;
12053 Py_ssize_t width = -1;
12054 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 Py_UCS4 c = '\0';
12056 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012057 int isnumok;
12058 PyObject *v = NULL;
12059 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 void *pbuf;
12061 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012062 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 Py_ssize_t len, len1;
12064 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 fmtpos++;
12067 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12068 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 Py_ssize_t keylen;
12070 PyObject *key;
12071 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012072
Benjamin Peterson29060642009-01-31 22:14:21 +000012073 if (dict == NULL) {
12074 PyErr_SetString(PyExc_TypeError,
12075 "format requires a mapping");
12076 goto onError;
12077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012079 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012081 /* Skip over balanced parentheses */
12082 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012084 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012086 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012090 if (fmtcnt < 0 || pcount > 0) {
12091 PyErr_SetString(PyExc_ValueError,
12092 "incomplete format key");
12093 goto onError;
12094 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012095 key = PyUnicode_Substring((PyObject*)uformat,
12096 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012097 if (key == NULL)
12098 goto onError;
12099 if (args_owned) {
12100 Py_DECREF(args);
12101 args_owned = 0;
12102 }
12103 args = PyObject_GetItem(dict, key);
12104 Py_DECREF(key);
12105 if (args == NULL) {
12106 goto onError;
12107 }
12108 args_owned = 1;
12109 arglen = -1;
12110 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012111 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 case '-': flags |= F_LJUST; continue;
12115 case '+': flags |= F_SIGN; continue;
12116 case ' ': flags |= F_BLANK; continue;
12117 case '#': flags |= F_ALT; continue;
12118 case '0': flags |= F_ZERO; continue;
12119 }
12120 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012121 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012122 if (c == '*') {
12123 v = getnextarg(args, arglen, &argidx);
12124 if (v == NULL)
12125 goto onError;
12126 if (!PyLong_Check(v)) {
12127 PyErr_SetString(PyExc_TypeError,
12128 "* wants int");
12129 goto onError;
12130 }
12131 width = PyLong_AsLong(v);
12132 if (width == -1 && PyErr_Occurred())
12133 goto onError;
12134 if (width < 0) {
12135 flags |= F_LJUST;
12136 width = -width;
12137 }
12138 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 }
12141 else if (c >= '0' && c <= '9') {
12142 width = c - '0';
12143 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 if (c < '0' || c > '9')
12146 break;
12147 if ((width*10) / 10 != width) {
12148 PyErr_SetString(PyExc_ValueError,
12149 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012150 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012151 }
12152 width = width*10 + (c - '0');
12153 }
12154 }
12155 if (c == '.') {
12156 prec = 0;
12157 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012159 if (c == '*') {
12160 v = getnextarg(args, arglen, &argidx);
12161 if (v == NULL)
12162 goto onError;
12163 if (!PyLong_Check(v)) {
12164 PyErr_SetString(PyExc_TypeError,
12165 "* wants int");
12166 goto onError;
12167 }
12168 prec = PyLong_AsLong(v);
12169 if (prec == -1 && PyErr_Occurred())
12170 goto onError;
12171 if (prec < 0)
12172 prec = 0;
12173 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012175 }
12176 else if (c >= '0' && c <= '9') {
12177 prec = c - '0';
12178 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012180 if (c < '0' || c > '9')
12181 break;
12182 if ((prec*10) / 10 != prec) {
12183 PyErr_SetString(PyExc_ValueError,
12184 "prec too big");
12185 goto onError;
12186 }
12187 prec = prec*10 + (c - '0');
12188 }
12189 }
12190 } /* prec */
12191 if (fmtcnt >= 0) {
12192 if (c == 'h' || c == 'l' || c == 'L') {
12193 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012195 }
12196 }
12197 if (fmtcnt < 0) {
12198 PyErr_SetString(PyExc_ValueError,
12199 "incomplete format");
12200 goto onError;
12201 }
12202 if (c != '%') {
12203 v = getnextarg(args, arglen, &argidx);
12204 if (v == NULL)
12205 goto onError;
12206 }
12207 sign = 0;
12208 fill = ' ';
12209 switch (c) {
12210
12211 case '%':
12212 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012214 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012216 len = 1;
12217 break;
12218
12219 case 's':
12220 case 'r':
12221 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012222 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012223 temp = v;
12224 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012225 }
12226 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012227 if (c == 's')
12228 temp = PyObject_Str(v);
12229 else if (c == 'r')
12230 temp = PyObject_Repr(v);
12231 else
12232 temp = PyObject_ASCII(v);
12233 if (temp == NULL)
12234 goto onError;
12235 if (PyUnicode_Check(temp))
12236 /* nothing to do */;
12237 else {
12238 Py_DECREF(temp);
12239 PyErr_SetString(PyExc_TypeError,
12240 "%s argument has non-string str()");
12241 goto onError;
12242 }
12243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244 if (PyUnicode_READY(temp) == -1) {
12245 Py_CLEAR(temp);
12246 goto onError;
12247 }
12248 pbuf = PyUnicode_DATA(temp);
12249 kind = PyUnicode_KIND(temp);
12250 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012251 if (prec >= 0 && len > prec)
12252 len = prec;
12253 break;
12254
12255 case 'i':
12256 case 'd':
12257 case 'u':
12258 case 'o':
12259 case 'x':
12260 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012261 isnumok = 0;
12262 if (PyNumber_Check(v)) {
12263 PyObject *iobj=NULL;
12264
12265 if (PyLong_Check(v)) {
12266 iobj = v;
12267 Py_INCREF(iobj);
12268 }
12269 else {
12270 iobj = PyNumber_Long(v);
12271 }
12272 if (iobj!=NULL) {
12273 if (PyLong_Check(iobj)) {
12274 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012275 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012276 Py_DECREF(iobj);
12277 if (!temp)
12278 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 if (PyUnicode_READY(temp) == -1) {
12280 Py_CLEAR(temp);
12281 goto onError;
12282 }
12283 pbuf = PyUnicode_DATA(temp);
12284 kind = PyUnicode_KIND(temp);
12285 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 sign = 1;
12287 }
12288 else {
12289 Py_DECREF(iobj);
12290 }
12291 }
12292 }
12293 if (!isnumok) {
12294 PyErr_Format(PyExc_TypeError,
12295 "%%%c format: a number is required, "
12296 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12297 goto onError;
12298 }
12299 if (flags & F_ZERO)
12300 fill = '0';
12301 break;
12302
12303 case 'e':
12304 case 'E':
12305 case 'f':
12306 case 'F':
12307 case 'g':
12308 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012309 temp = formatfloat(v, flags, prec, c);
12310 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 if (PyUnicode_READY(temp) == -1) {
12313 Py_CLEAR(temp);
12314 goto onError;
12315 }
12316 pbuf = PyUnicode_DATA(temp);
12317 kind = PyUnicode_KIND(temp);
12318 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012319 sign = 1;
12320 if (flags & F_ZERO)
12321 fill = '0';
12322 break;
12323
12324 case 'c':
12325 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012327 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012328 if (len < 0)
12329 goto onError;
12330 break;
12331
12332 default:
12333 PyErr_Format(PyExc_ValueError,
12334 "unsupported format character '%c' (0x%x) "
12335 "at index %zd",
12336 (31<=c && c<=126) ? (char)c : '?',
12337 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012339 goto onError;
12340 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 /* pbuf is initialized here. */
12342 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12345 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12346 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012347 len--;
12348 }
12349 else if (flags & F_SIGN)
12350 sign = '+';
12351 else if (flags & F_BLANK)
12352 sign = ' ';
12353 else
12354 sign = 0;
12355 }
12356 if (width < len)
12357 width = len;
12358 if (rescnt - (sign != 0) < width) {
12359 reslen -= rescnt;
12360 rescnt = width + fmtcnt + 100;
12361 reslen += rescnt;
12362 if (reslen < 0) {
12363 Py_XDECREF(temp);
12364 PyErr_NoMemory();
12365 goto onError;
12366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12368 if (res0 == 0) {
12369 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012370 Py_XDECREF(temp);
12371 goto onError;
12372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012374 }
12375 if (sign) {
12376 if (fill != ' ')
12377 *res++ = sign;
12378 rescnt--;
12379 if (width > len)
12380 width--;
12381 }
12382 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012383 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12384 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012385 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12387 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 }
12389 rescnt -= 2;
12390 width -= 2;
12391 if (width < 0)
12392 width = 0;
12393 len -= 2;
12394 }
12395 if (width > len && !(flags & F_LJUST)) {
12396 do {
12397 --rescnt;
12398 *res++ = fill;
12399 } while (--width > len);
12400 }
12401 if (fill == ' ') {
12402 if (sign)
12403 *res++ = sign;
12404 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12406 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12407 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12408 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012409 }
12410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 /* Copy all characters, preserving len */
12412 len1 = len;
12413 while (len1--) {
12414 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12415 rescnt--;
12416 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012417 while (--width >= len) {
12418 --rescnt;
12419 *res++ = ' ';
12420 }
12421 if (dict && (argidx < arglen) && c != '%') {
12422 PyErr_SetString(PyExc_TypeError,
12423 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012424 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012425 goto onError;
12426 }
12427 Py_XDECREF(temp);
12428 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012429 } /* until end */
12430 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012431 PyErr_SetString(PyExc_TypeError,
12432 "not all arguments converted during string formatting");
12433 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434 }
12435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436
12437 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12438 if (*res > max)
12439 max = *res;
12440 result = PyUnicode_New(reslen - rescnt, max);
12441 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012442 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 kind = PyUnicode_KIND(result);
12444 for (res = res0; res < res0+reslen-rescnt; res++)
12445 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12446 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012448 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449 }
12450 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451 return (PyObject *)result;
12452
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012455 Py_DECREF(uformat);
12456 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012457 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458 }
12459 return NULL;
12460}
12461
Jeremy Hylton938ace62002-07-17 16:30:39 +000012462static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012463unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12464
Tim Peters6d6c1a32001-08-02 04:15:00 +000012465static PyObject *
12466unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12467{
Benjamin Peterson29060642009-01-31 22:14:21 +000012468 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012469 static char *kwlist[] = {"object", "encoding", "errors", 0};
12470 char *encoding = NULL;
12471 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012472
Benjamin Peterson14339b62009-01-31 16:36:08 +000012473 if (type != &PyUnicode_Type)
12474 return unicode_subtype_new(type, args, kwds);
12475 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012476 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012477 return NULL;
12478 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012480 if (encoding == NULL && errors == NULL)
12481 return PyObject_Str(x);
12482 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012483 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012484}
12485
Guido van Rossume023fe02001-08-30 03:12:59 +000012486static PyObject *
12487unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12488{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012489 PyUnicodeObject *unicode, *self;
12490 Py_ssize_t length, char_size;
12491 int share_wstr, share_utf8;
12492 unsigned int kind;
12493 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012494
Benjamin Peterson14339b62009-01-31 16:36:08 +000012495 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012496
12497 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12498 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012499 return NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012500 assert(PyUnicode_Check(unicode));
12501 if (PyUnicode_READY(unicode))
12502 return NULL;
12503
12504 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12505 if (self == NULL) {
12506 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012507 return NULL;
12508 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012509 kind = PyUnicode_KIND(unicode);
12510 length = PyUnicode_GET_LENGTH(unicode);
12511
12512 _PyUnicode_LENGTH(self) = length;
12513 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12514 _PyUnicode_STATE(self).interned = 0;
12515 _PyUnicode_STATE(self).kind = kind;
12516 _PyUnicode_STATE(self).compact = 0;
12517 _PyUnicode_STATE(self).ascii = 0;
12518 _PyUnicode_STATE(self).ready = 1;
12519 _PyUnicode_WSTR(self) = NULL;
12520 _PyUnicode_UTF8_LENGTH(self) = 0;
12521 _PyUnicode_UTF8(self) = NULL;
12522 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012523 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012524
12525 share_utf8 = 0;
12526 share_wstr = 0;
12527 if (kind == PyUnicode_1BYTE_KIND) {
12528 char_size = 1;
12529 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12530 share_utf8 = 1;
12531 }
12532 else if (kind == PyUnicode_2BYTE_KIND) {
12533 char_size = 2;
12534 if (sizeof(wchar_t) == 2)
12535 share_wstr = 1;
12536 }
12537 else {
12538 assert(kind == PyUnicode_4BYTE_KIND);
12539 char_size = 4;
12540 if (sizeof(wchar_t) == 4)
12541 share_wstr = 1;
12542 }
12543
12544 /* Ensure we won't overflow the length. */
12545 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12546 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012548 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012549 data = PyObject_MALLOC((length + 1) * char_size);
12550 if (data == NULL) {
12551 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552 goto onError;
12553 }
12554
Victor Stinnerc3c74152011-10-02 20:39:55 +020012555 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012556 if (share_utf8) {
12557 _PyUnicode_UTF8_LENGTH(self) = length;
12558 _PyUnicode_UTF8(self) = data;
12559 }
12560 if (share_wstr) {
12561 _PyUnicode_WSTR_LENGTH(self) = length;
12562 _PyUnicode_WSTR(self) = (wchar_t *)data;
12563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012565 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12566 PyUnicode_KIND_SIZE(kind, length + 1));
12567 Py_DECREF(unicode);
12568 return (PyObject *)self;
12569
12570onError:
12571 Py_DECREF(unicode);
12572 Py_DECREF(self);
12573 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012574}
12575
/* Docstring text for the str type; referenced as the tp_doc entry of
   PyUnicode_Type. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012576PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012577 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012578\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012579Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012580encoding defaults to the current default string encoding.\n\
12581errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012582
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012583static PyObject *unicode_iter(PyObject *seq);
12584
/* Type object for str.  Entries are positional; the trailing comment on
   each line names the PyTypeObject slot it fills.  The flags mark the type
   as subclassable (Py_TPFLAGS_BASETYPE) and as a unicode subclass. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012586 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012587 "str", /* tp_name */
12588 sizeof(PyUnicodeObject), /* tp_basicsize */
12589 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012591 (destructor)unicode_dealloc, /* tp_dealloc */
12592 0, /* tp_print */
12593 0, /* tp_getattr */
12594 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012595 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012596 unicode_repr, /* tp_repr */
12597 &unicode_as_number, /* tp_as_number */
12598 &unicode_as_sequence, /* tp_as_sequence */
12599 &unicode_as_mapping, /* tp_as_mapping */
12600 (hashfunc) unicode_hash, /* tp_hash*/
12601 0, /* tp_call*/
12602 (reprfunc) unicode_str, /* tp_str */
12603 PyObject_GenericGetAttr, /* tp_getattro */
12604 0, /* tp_setattro */
12605 0, /* tp_as_buffer */
12606 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012607 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012608 unicode_doc, /* tp_doc */
12609 0, /* tp_traverse */
12610 0, /* tp_clear */
12611 PyUnicode_RichCompare, /* tp_richcompare */
12612 0, /* tp_weaklistoffset */
12613 unicode_iter, /* tp_iter */
12614 0, /* tp_iternext */
12615 unicode_methods, /* tp_methods */
12616 0, /* tp_members */
12617 0, /* tp_getset */
12618 &PyBaseObject_Type, /* tp_base */
12619 0, /* tp_dict */
12620 0, /* tp_descr_get */
12621 0, /* tp_descr_set */
12622 0, /* tp_dictoffset */
12623 0, /* tp_init */
12624 0, /* tp_alloc */
12625 unicode_new, /* tp_new */
12626 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627};
12628
12629/* Initialize the Unicode implementation */
12630
Thomas Wouters78890102000-07-22 19:25:51 +000012631void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012633 int i;
12634
Thomas Wouters477c8d52006-05-27 19:21:47 +000012635 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012637 0x000A, /* LINE FEED */
12638 0x000D, /* CARRIAGE RETURN */
12639 0x001C, /* FILE SEPARATOR */
12640 0x001D, /* GROUP SEPARATOR */
12641 0x001E, /* RECORD SEPARATOR */
12642 0x0085, /* NEXT LINE */
12643 0x2028, /* LINE SEPARATOR */
12644 0x2029, /* PARAGRAPH SEPARATOR */
12645 };
12646
Fred Drakee4315f52000-05-09 19:53:39 +000012647 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012648 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012649 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012651
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012652 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012653 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012654 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012655 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012656
12657 /* initialize the linebreak bloom filter */
12658 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012660 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012661
12662 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663}
12664
12665/* Finalize the Unicode implementation */
12666
/* Releases nothing in this implementation; the return value is always 0. */
Christian Heimesa156e092008-02-16 07:38:31 +000012667int
12668PyUnicode_ClearFreeList(void)
12669{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 return 0;
Christian Heimesa156e092008-02-16 07:38:31 +000012671}
12672
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673void
Thomas Wouters78890102000-07-22 19:25:51 +000012674_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012676 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012678 Py_XDECREF(unicode_empty);
12679 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012680
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012681 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012682 if (unicode_latin1[i]) {
12683 Py_DECREF(unicode_latin1[i]);
12684 unicode_latin1[i] = NULL;
12685 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012686 }
Christian Heimesa156e092008-02-16 07:38:31 +000012687 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012689
/* Intern the string referenced by *p using the file-level `interned` dict.

   - *p must be a non-NULL unicode object, otherwise Py_FatalError().
   - Instances of subclasses and strings already marked interned are left
     untouched.
   - If an equal string is already in the dict, *p is replaced by a new
     reference to that object and the old reference is released.
   - Otherwise the string is stored as both key and value, its refcount is
     lowered by two to hide the dict's references, and it is marked
     SSTATE_INTERNED_MORTAL.
   Errors from creating the dict or inserting into it are cleared and the
   function returns without interning. */
Walter Dörwald16807132007-05-25 13:52:07 +000012690void
12691PyUnicode_InternInPlace(PyObject **p)
12692{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012693 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12694 PyObject *t;
12695 if (s == NULL || !PyUnicode_Check(s))
12696 Py_FatalError(
12697 "PyUnicode_InternInPlace: unicode strings only please!");
12698 /* If it's a subclass, we don't really know what putting
12699 it in the interned dict might do. */
12700 if (!PyUnicode_CheckExact(s))
12701 return;
12702 if (PyUnicode_CHECK_INTERNED(s))
12703 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 if (PyUnicode_READY(s) == -1) {
12705 assert(0 && "ready fail in intern...");
12706 return;
12707 }
/* The interned dict is created lazily on first use. */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012708 if (interned == NULL) {
12709 interned = PyDict_New();
12710 if (interned == NULL) {
12711 PyErr_Clear(); /* Don't leave an exception */
12712 return;
12713 }
12714 }
12715 /* It might be that the GetItem call fails even
12716 though the key is present in the dictionary,
12717 namely when this happens during a stack overflow. */
12718 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012719 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012720 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012721
/* An equal string is already interned: hand that object back through *p. */
Benjamin Peterson29060642009-01-31 22:14:21 +000012722 if (t) {
12723 Py_INCREF(t);
12724 Py_DECREF(*p);
12725 *p = t;
12726 return;
12727 }
Walter Dörwald16807132007-05-25 13:52:07 +000012728
/* Not present yet: insert s as both key and value, with the thread's
   recursion_critical flag set for the duration of the insertion. */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012729 PyThreadState_GET()->recursion_critical = 1;
12730 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12731 PyErr_Clear();
12732 PyThreadState_GET()->recursion_critical = 0;
12733 return;
12734 }
12735 PyThreadState_GET()->recursion_critical = 0;
12736 /* The two references in interned are not counted by refcnt.
12737 The deallocator will take care of this */
12738 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012740}
12741
12742void
12743PyUnicode_InternImmortal(PyObject **p)
12744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12746
Benjamin Peterson14339b62009-01-31 16:36:08 +000012747 PyUnicode_InternInPlace(p);
12748 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012750 Py_INCREF(*p);
12751 }
Walter Dörwald16807132007-05-25 13:52:07 +000012752}
12753
12754PyObject *
12755PyUnicode_InternFromString(const char *cp)
12756{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012757 PyObject *s = PyUnicode_FromString(cp);
12758 if (s == NULL)
12759 return NULL;
12760 PyUnicode_InternInPlace(&s);
12761 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012762}
12763
/* Undo interning for every key of the `interned` dict.

   For each string the refcount is raised by the amount matching its
   interned state (1 for immortal, 2 for mortal) and the state is reset to
   SSTATE_NOT_INTERNED; the dict is then cleared, released and the global
   set to NULL.  The number of strings and the summed lengths of the
   mortal and immortal ones are written to stderr.  Does nothing when the
   dict does not exist or its keys cannot be listed. */
Alexander Belopolsky40018472011-02-26 01:02:56 +000012764void
12765_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012766{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012767 PyObject *keys;
12768 PyUnicodeObject *s;
12769 Py_ssize_t i, n;
12770 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012771
Benjamin Peterson14339b62009-01-31 16:36:08 +000012772 if (interned == NULL || !PyDict_Check(interned))
12773 return;
12774 keys = PyDict_Keys(interned);
12775 if (keys == NULL || !PyList_Check(keys)) {
12776 PyErr_Clear();
12777 return;
12778 }
Walter Dörwald16807132007-05-25 13:52:07 +000012779
Benjamin Peterson14339b62009-01-31 16:36:08 +000012780 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12781 detector, interned unicode strings are not forcibly deallocated;
12782 rather, we give them their stolen references back, and then clear
12783 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012784
Benjamin Peterson14339b62009-01-31 16:36:08 +000012785 n = PyList_GET_SIZE(keys);
12786 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012787 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012788 for (i = 0; i < n; i++) {
12789 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
/* A failure to ready the string is only reported; processing continues. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790 if (PyUnicode_READY(s) == -1)
12791 fprintf(stderr, "could not ready string\n");
12792 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012793 case SSTATE_NOT_INTERNED:
12794 /* XXX Shouldn't happen */
12795 break;
12796 case SSTATE_INTERNED_IMMORTAL:
12797 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012799 break;
12800 case SSTATE_INTERNED_MORTAL:
/* Matches the "-= 2" applied when a string is interned as mortal. */
12801 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012803 break;
12804 default:
12805 Py_FatalError("Inconsistent interned string state.");
12806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012808 }
12809 fprintf(stderr, "total size of all interned strings: "
12810 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12811 "mortal/immortal\n", mortal_size, immortal_size);
12812 Py_DECREF(keys);
12813 PyDict_Clear(interned);
12814 Py_DECREF(interned);
12815 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012816}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012817
12818
12819/********************* Unicode Iterator **************************/
12820
/* State of a str iterator: the string being walked and the position of
   the next character to return. */
12821typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012822 PyObject_HEAD
12823 Py_ssize_t it_index; /* Index into it_seq of the next character to yield */
12824 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012825} unicodeiterobject;
12826
/* Destructor for str iterators: stop GC tracking, release the referenced
   string (it_seq may already be NULL), then free the iterator itself. */
12827static void
12828unicodeiter_dealloc(unicodeiterobject *it)
12829{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012830 _PyObject_GC_UNTRACK(it);
12831 Py_XDECREF(it->it_seq);
12832 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012833}
12834
/* GC traversal hook: the only object reference held by the iterator is
   it_seq.  Returns 0 unless Py_VISIT returns early. */
12835static int
12836unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12837{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012838 Py_VISIT(it->it_seq);
12839 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012840}
12841
12842static PyObject *
12843unicodeiter_next(unicodeiterobject *it)
12844{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012845 PyUnicodeObject *seq;
12846 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012847
Benjamin Peterson14339b62009-01-31 16:36:08 +000012848 assert(it != NULL);
12849 seq = it->it_seq;
12850 if (seq == NULL)
12851 return NULL;
12852 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12855 int kind = PyUnicode_KIND(seq);
12856 void *data = PyUnicode_DATA(seq);
12857 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12858 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012859 if (item != NULL)
12860 ++it->it_index;
12861 return item;
12862 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012863
Benjamin Peterson14339b62009-01-31 16:36:08 +000012864 Py_DECREF(seq);
12865 it->it_seq = NULL;
12866 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012867}
12868
12869static PyObject *
12870unicodeiter_len(unicodeiterobject *it)
12871{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012872 Py_ssize_t len = 0;
12873 if (it->it_seq)
12874 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12875 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012876}
12877
/* Docstring and method table for str iterators; the table exposes
   unicodeiter_len as the no-argument method __length_hint__. */
12878PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12879
12880static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012881 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012882 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012883 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012884};
12885
/* Type object for the iterator returned by unicode_iter().  Entries are
   positional; each trailing comment names the slot.  The type takes part
   in garbage collection (Py_TPFLAGS_HAVE_GC) and is its own iterator
   (tp_iter is PyObject_SelfIter). */
12886PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012887 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12888 "str_iterator", /* tp_name */
12889 sizeof(unicodeiterobject), /* tp_basicsize */
12890 0, /* tp_itemsize */
12891 /* methods */
12892 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12893 0, /* tp_print */
12894 0, /* tp_getattr */
12895 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012896 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012897 0, /* tp_repr */
12898 0, /* tp_as_number */
12899 0, /* tp_as_sequence */
12900 0, /* tp_as_mapping */
12901 0, /* tp_hash */
12902 0, /* tp_call */
12903 0, /* tp_str */
12904 PyObject_GenericGetAttr, /* tp_getattro */
12905 0, /* tp_setattro */
12906 0, /* tp_as_buffer */
12907 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12908 0, /* tp_doc */
12909 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12910 0, /* tp_clear */
12911 0, /* tp_richcompare */
12912 0, /* tp_weaklistoffset */
12913 PyObject_SelfIter, /* tp_iter */
12914 (iternextfunc)unicodeiter_next, /* tp_iternext */
12915 unicodeiter_methods, /* tp_methods */
12916 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012917};
12918
12919static PyObject *
12920unicode_iter(PyObject *seq)
12921{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012922 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012923
Benjamin Peterson14339b62009-01-31 16:36:08 +000012924 if (!PyUnicode_Check(seq)) {
12925 PyErr_BadInternalCall();
12926 return NULL;
12927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928 if (PyUnicode_READY(seq) == -1)
12929 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012930 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12931 if (it == NULL)
12932 return NULL;
12933 it->it_index = 0;
12934 Py_INCREF(seq);
12935 it->it_seq = (PyUnicodeObject *)seq;
12936 _PyObject_GC_TRACK(it);
12937 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012938}
12939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012940#define UNIOP(x) Py_UNICODE_##x
12941#define UNIOP_t Py_UNICODE
12942#include "uniops.h"
12943#undef UNIOP
12944#undef UNIOP_t
12945#define UNIOP(x) Py_UCS4_##x
12946#define UNIOP_t Py_UCS4
12947#include "uniops.h"
12948#undef UNIOP
12949#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012950
Victor Stinner71133ff2010-09-01 23:43:53 +000012951Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012952PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012953{
12954 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12955 Py_UNICODE *copy;
12956 Py_ssize_t size;
12957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012958 if (!PyUnicode_Check(unicode)) {
12959 PyErr_BadArgument();
12960 return NULL;
12961 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012962 /* Ensure we won't overflow the size. */
12963 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12964 PyErr_NoMemory();
12965 return NULL;
12966 }
12967 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12968 size *= sizeof(Py_UNICODE);
12969 copy = PyMem_Malloc(size);
12970 if (copy == NULL) {
12971 PyErr_NoMemory();
12972 return NULL;
12973 }
12974 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12975 return copy;
12976}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012977
Georg Brandl66c221e2010-10-14 07:04:07 +000012978/* A _string module, to export formatter_parser and formatter_field_name_split
12979 to the string.Formatter class implemented in Python. */
12980
12981static PyMethodDef _string_methods[] = {
12982 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12983 METH_O, PyDoc_STR("split the argument as a field name")},
12984 {"formatter_parser", (PyCFunction) formatter_parser,
12985 METH_O, PyDoc_STR("parse the argument as a format string")},
12986 {NULL, NULL}
12987};
12988
12989static struct PyModuleDef _string_module = {
12990 PyModuleDef_HEAD_INIT,
12991 "_string",
12992 PyDoc_STR("string helper module"),
12993 0,
12994 _string_methods,
12995 NULL,
12996 NULL,
12997 NULL,
12998 NULL
12999};
13000
13001PyMODINIT_FUNC
13002PyInit__string(void)
13003{
13004 return PyModule_Create(&_string_module);
13005}
13006
13007
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013008#ifdef __cplusplus
13009}
13010#endif