blob: 3b16959f971e1e42a2f0ce3755e7c931b2529acf [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Upper bound on the number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list "stay alive" optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode memory intact, which reduces malloc() overhead for
   small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: this feature is experimental.  If core dumps show up when
   using Unicode objects, switch it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is the default. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Generic helper macro that copies a run of characters while converting
   each one from one character type to another.

   from_type, to_type: valid type names of the source and target characters.
   begin, end:         pointers of type "from_type *" delimiting the source
                       range [begin, end).
   to:                 pointer (cast to "to_type *") to the buffer receiving
                       the converted characters.

   Each source character is converted with a plain cast to to_type. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)
106
/* Field accessors for the string object structures.

   The underscore-prefixed macros are plain casts plus a member access;
   unless shown otherwise below they perform no checks and can be used
   as lvalues. */

/* utf8 member of the object viewed as a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked variant: for compact ASCII objects the result is the address
   directly behind the PyASCIIObject header, otherwise the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* utf8_length member of the object viewed as a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked variant: for compact ASCII objects the result is the length
   member of the PyASCIIObject, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked member accessors. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Accessors that only assert that op is a Unicode object. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* data.any member of the object viewed as a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200135
/* True if the Unicode object owns a separately allocated UTF-8 memory
   block, i.e. the object is not compact ASCII, its utf8 pointer is set,
   and that pointer is not shared with the object's character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(PyUnicode_Check(op)),                       \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
143
144
/* Call after the Unicode string has been modified: the stored hash is
   reset to -1. */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
147
Walter Dörwald16807132007-05-25 13:52:07 +0000148/* This dictionary holds all interned unicode strings. Note that references
149 to strings in this dictionary are *not* counted in the string's ob_refcnt.
150 When the interned string reaches a refcnt of 0 the string deallocation
151 function will delete the reference from this dictionary.
152
153 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000154 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000155*/
156static PyObject *interned;
157
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000158/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200159static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000160
161/* Single character Unicode strings in the Latin-1 range are being
162 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200163static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000164
/* Fast detection of the most frequent whitespace characters: one flag per
   ASCII code point (128 entries), non-zero for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
195
Alexander Belopolsky40018472011-02-26 01:02:56 +0000196static PyObject *
197unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000198 PyObject **errorHandler,const char *encoding, const char *reason,
199 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
200 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
201
Alexander Belopolsky40018472011-02-26 01:02:56 +0000202static void
203raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300204 const char *encoding,
205 const Py_UNICODE *unicode, Py_ssize_t size,
206 Py_ssize_t startpos, Py_ssize_t endpos,
207 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000208
/* Fast detection of line break characters in the ASCII range: one flag per
   ASCII code point (128 entries), non-zero for line breaks.  The table is
   only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
236
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300237/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
238 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000239Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000240PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000241{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000242#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000243 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000244#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000245 /* This is actually an illegal character, so it should
246 not be passed to unichr. */
247 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000248#endif
249}
250
Thomas Wouters477c8d52006-05-27 19:21:47 +0000251/* --- Bloom Filters ----------------------------------------------------- */
252
253/* stuff to implement simple "bloom filters" for Unicode characters.
254 to keep things simple, we use a single bitmask, using the least 5
255 bits from each unicode characters as the bit index. */
256
257/* the linebreak mask is set up by Unicode_Init below */
258
Antoine Pitrouf068f942010-01-13 14:19:12 +0000259#if LONG_BIT >= 128
260#define BLOOM_WIDTH 128
261#elif LONG_BIT >= 64
262#define BLOOM_WIDTH 64
263#elif LONG_BIT >= 32
264#define BLOOM_WIDTH 32
265#else
266#error "LONG_BIT is smaller than 32"
267#endif
268
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269#define BLOOM_MASK unsigned long
270
271static BLOOM_MASK bloom_linebreak;
272
Antoine Pitrouf068f942010-01-13 14:19:12 +0000273#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
274#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000275
Benjamin Peterson29060642009-01-31 22:14:21 +0000276#define BLOOM_LINEBREAK(ch) \
277 ((ch) < 128U ? ascii_linebreak[(ch)] : \
278 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000279
Alexander Belopolsky40018472011-02-26 01:02:56 +0000280Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282{
283 /* calculate simple bloom-style bitmask for a given unicode string */
284
Antoine Pitrouf068f942010-01-13 14:19:12 +0000285 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000286 Py_ssize_t i;
287
288 mask = 0;
289 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200290 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000291
292 return mask;
293}
294
/* Non-zero (1) if chr occurs in str.  The bloom mask is tested first;
   the string is only searched with PyUnicode_FindChar() on a mask hit. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     ? (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0) \
     : 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000298
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299/* --- Unicode Object ----------------------------------------------------- */
300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200301static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200302fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
303
304Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
305 Py_ssize_t size, Py_UCS4 ch,
306 int direction)
307{
308 /* like wcschr, but doesn't stop at NULL characters */
309 Py_ssize_t i;
310 if (direction == 1) {
311 for(i = 0; i < size; i++)
312 if (PyUnicode_READ(kind, s, i) == ch)
313 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
314 }
315 else {
316 for(i = size-1; i >= 0; i--)
317 if (PyUnicode_READ(kind, s, i) == ch)
318 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
319 }
320 return NULL;
321}
322
Alexander Belopolsky40018472011-02-26 01:02:56 +0000323static int
324unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200325 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326{
327 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200329 /* Resizing is only supported for old unicode objects. */
330 assert(!PyUnicode_IS_COMPACT(unicode));
331 assert(_PyUnicode_WSTR(unicode) != NULL);
332
333 /* ... and only if they have not been readied yet, because
334 callees usually rely on the wstr representation when resizing. */
335 assert(unicode->data.any == NULL);
336
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000337 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200338 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000341 /* Resizing shared object (unicode_empty or single character
342 objects) in-place is not allowed. Use PyUnicode_Resize()
343 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000344
Benjamin Peterson14339b62009-01-31 16:36:08 +0000345 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
347 _PyUnicode_WSTR(unicode)[0] < 256U &&
348 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000350 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 return -1;
352 }
353
Thomas Wouters477c8d52006-05-27 19:21:47 +0000354 /* We allocate one more byte to make sure the string is Ux0000 terminated.
355 The overallocation is also used by fastsearch, which assumes that it's
356 safe to look at str[length] (without making any assumptions about what
357 it contains). */
358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200359 oldstr = _PyUnicode_WSTR(unicode);
360 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
361 sizeof(Py_UNICODE) * (length + 1));
362 if (!_PyUnicode_WSTR(unicode)) {
363 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364 PyErr_NoMemory();
365 return -1;
366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200367 _PyUnicode_WSTR(unicode)[length] = 0;
368 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369
Benjamin Peterson29060642009-01-31 22:14:21 +0000370 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200371 if (unicode->data.any != NULL) {
372 PyObject_FREE(unicode->data.any);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200373 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != unicode->data.any) {
374 PyObject_FREE(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200375 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200376 _PyUnicode_UTF8(unicode) = NULL;
377 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200378 unicode->data.any = NULL;
379 _PyUnicode_LENGTH(unicode) = 0;
380 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
381 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200383 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000384
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 return 0;
386}
387
388/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000389 Ux0000 terminated; some code (e.g. new_identifier)
390 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391
392 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394
395*/
396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200397#ifdef Py_DEBUG
398int unicode_old_new_calls = 0;
399#endif
400
Alexander Belopolsky40018472011-02-26 01:02:56 +0000401static PyUnicodeObject *
402_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000403{
404 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200405 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000406
Thomas Wouters477c8d52006-05-27 19:21:47 +0000407 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 if (length == 0 && unicode_empty != NULL) {
409 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200410 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000411 }
412
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000413 /* Ensure we won't overflow the size. */
414 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
415 return (PyUnicodeObject *)PyErr_NoMemory();
416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200417 if (length < 0) {
418 PyErr_SetString(PyExc_SystemError,
419 "Negative size passed to _PyUnicode_New");
420 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421 }
422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200423#ifdef Py_DEBUG
424 ++unicode_old_new_calls;
425#endif
426
427 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
428 if (unicode == NULL)
429 return NULL;
430 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
431 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
432 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyErr_NoMemory();
434 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200436
Jeremy Hyltond8082792003-09-16 19:41:39 +0000437 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000438 * the caller fails before initializing str -- unicode_resize()
439 * reads str[0], and the Keep-Alive optimization can keep memory
440 * allocated for str alive across a call to unicode_dealloc(unicode).
441 * We don't want unicode_resize to read uninitialized memory in
442 * that case.
443 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200444 _PyUnicode_WSTR(unicode)[0] = 0;
445 _PyUnicode_WSTR(unicode)[length] = 0;
446 _PyUnicode_WSTR_LENGTH(unicode) = length;
447 _PyUnicode_HASH(unicode) = -1;
448 _PyUnicode_STATE(unicode).interned = 0;
449 _PyUnicode_STATE(unicode).kind = 0;
450 _PyUnicode_STATE(unicode).compact = 0;
451 _PyUnicode_STATE(unicode).ready = 0;
452 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200453 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200454 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200455 _PyUnicode_UTF8(unicode) = NULL;
456 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000457 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000458
Benjamin Peterson29060642009-01-31 22:14:21 +0000459 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000460 /* XXX UNREF/NEWREF interface should be more symmetrical */
461 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000462 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000463 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000464 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000465}
466
Victor Stinnerf42dc442011-10-02 23:33:16 +0200467static const char*
468unicode_kind_name(PyObject *unicode)
469{
470 assert(PyUnicode_Check(unicode));
471 if (!PyUnicode_IS_COMPACT(unicode))
472 {
473 if (!PyUnicode_IS_READY(unicode))
474 return "wstr";
475 switch(PyUnicode_KIND(unicode))
476 {
477 case PyUnicode_1BYTE_KIND:
478 if (PyUnicode_IS_COMPACT_ASCII(unicode))
479 return "legacy ascii";
480 else
481 return "legacy latin1";
482 case PyUnicode_2BYTE_KIND:
483 return "legacy UCS2";
484 case PyUnicode_4BYTE_KIND:
485 return "legacy UCS4";
486 default:
487 return "<legacy invalid kind>";
488 }
489 }
490 assert(PyUnicode_IS_READY(unicode));
491 switch(PyUnicode_KIND(unicode))
492 {
493 case PyUnicode_1BYTE_KIND:
494 if (PyUnicode_IS_COMPACT_ASCII(unicode))
495 return "ascii";
496 else
497 return "compact latin1";
498 case PyUnicode_2BYTE_KIND:
499 return "compact UCS2";
500 case PyUnicode_4BYTE_KIND:
501 return "compact UCS4";
502 default:
503 return "<invalid compact kind>";
504 }
505}
506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200507#ifdef Py_DEBUG
508int unicode_new_new_calls = 0;
509
510/* Functions wrapping macros for use in debugger */
511char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200512 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200513}
514
515void *_PyUnicode_compact_data(void *unicode) {
516 return _PyUnicode_COMPACT_DATA(unicode);
517}
518void *_PyUnicode_data(void *unicode){
519 printf("obj %p\n", unicode);
520 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
521 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
522 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
523 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
524 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
525 return PyUnicode_DATA(unicode);
526}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200527
528void
529_PyUnicode_Dump(PyObject *op)
530{
531 PyASCIIObject *ascii = (PyASCIIObject *)op;
532 printf("%s: len=%zu, wstr=%p",
533 unicode_kind_name(op),
534 ascii->length,
535 ascii->wstr);
536 if (!ascii->state.ascii) {
537 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
538 printf(" (%zu), utf8=%p (%zu)",
539 compact->wstr_length,
540 compact->utf8,
541 compact->utf8_length);
542 }
543 if (!ascii->state.compact) {
544 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
545 printf(", data=%p",
546 unicode->data.any);
547 }
548 printf("\n");
549}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550#endif
551
552PyObject *
553PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
554{
555 PyObject *obj;
556 PyCompactUnicodeObject *unicode;
557 void *data;
558 int kind_state;
559 int is_sharing = 0, is_ascii = 0;
560 Py_ssize_t char_size;
561 Py_ssize_t struct_size;
562
563 /* Optimization for empty strings */
564 if (size == 0 && unicode_empty != NULL) {
565 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200566 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200567 }
568
569#ifdef Py_DEBUG
570 ++unicode_new_new_calls;
571#endif
572
573 struct_size = sizeof(PyCompactUnicodeObject);
574 if (maxchar < 128) {
575 kind_state = PyUnicode_1BYTE_KIND;
576 char_size = 1;
577 is_ascii = 1;
578 struct_size = sizeof(PyASCIIObject);
579 }
580 else if (maxchar < 256) {
581 kind_state = PyUnicode_1BYTE_KIND;
582 char_size = 1;
583 }
584 else if (maxchar < 65536) {
585 kind_state = PyUnicode_2BYTE_KIND;
586 char_size = 2;
587 if (sizeof(wchar_t) == 2)
588 is_sharing = 1;
589 }
590 else {
591 kind_state = PyUnicode_4BYTE_KIND;
592 char_size = 4;
593 if (sizeof(wchar_t) == 4)
594 is_sharing = 1;
595 }
596
597 /* Ensure we won't overflow the size. */
598 if (size < 0) {
599 PyErr_SetString(PyExc_SystemError,
600 "Negative size passed to PyUnicode_New");
601 return NULL;
602 }
603 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
604 return PyErr_NoMemory();
605
606 /* Duplicated allocation code from _PyObject_New() instead of a call to
607 * PyObject_New() so we are able to allocate space for the object and
608 * it's data buffer.
609 */
610 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
611 if (obj == NULL)
612 return PyErr_NoMemory();
613 obj = PyObject_INIT(obj, &PyUnicode_Type);
614 if (obj == NULL)
615 return NULL;
616
617 unicode = (PyCompactUnicodeObject *)obj;
618 if (is_ascii)
619 data = ((PyASCIIObject*)obj) + 1;
620 else
621 data = unicode + 1;
622 _PyUnicode_LENGTH(unicode) = size;
623 _PyUnicode_HASH(unicode) = -1;
624 _PyUnicode_STATE(unicode).interned = 0;
625 _PyUnicode_STATE(unicode).kind = kind_state;
626 _PyUnicode_STATE(unicode).compact = 1;
627 _PyUnicode_STATE(unicode).ready = 1;
628 _PyUnicode_STATE(unicode).ascii = is_ascii;
629 if (is_ascii) {
630 ((char*)data)[size] = 0;
631 _PyUnicode_WSTR(unicode) = NULL;
632 }
633 else if (kind_state == PyUnicode_1BYTE_KIND) {
634 ((char*)data)[size] = 0;
635 _PyUnicode_WSTR(unicode) = NULL;
636 _PyUnicode_WSTR_LENGTH(unicode) = 0;
637 unicode->utf8_length = 0;
638 unicode->utf8 = NULL;
639 }
640 else {
641 unicode->utf8 = NULL;
642 if (kind_state == PyUnicode_2BYTE_KIND)
643 ((Py_UCS2*)data)[size] = 0;
644 else /* kind_state == PyUnicode_4BYTE_KIND */
645 ((Py_UCS4*)data)[size] = 0;
646 if (is_sharing) {
647 _PyUnicode_WSTR_LENGTH(unicode) = size;
648 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
649 }
650 else {
651 _PyUnicode_WSTR_LENGTH(unicode) = 0;
652 _PyUnicode_WSTR(unicode) = NULL;
653 }
654 }
655 return obj;
656}
657
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   begin, end: source range [begin, end) of wchar_t characters
   unicode:    destination object; must be of 4-byte kind and its length
               must equal the number of code points produced (both are
               checked with assert)

   A high surrogate (0xD800-0xDBFF) directly followed by a low surrogate
   (0xDC00-0xDFFF) is combined into a single code point; every other
   character, including a lone surrogate, is copied unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *iter;
    Py_UCS4 *ucs4_out;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    ucs4_out = PyUnicode_4BYTE_DATA(unicode);

    for (iter = begin; iter < end; ) {
        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
                           _PyUnicode_GET_LENGTH(unicode)));
        if (*iter >= 0xD800 && *iter <= 0xDBFF
            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
        {
            *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
            iter += 2;
        }
        else {
            *ucs4_out++ = *iter;
            iter++;
        }
    }
    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
                        _PyUnicode_GET_LENGTH(unicode)));
    /* The function is declared void, so there is no value to return. */
}
#endif
696
Victor Stinnercd9950f2011-10-02 00:34:53 +0200697static int
698_PyUnicode_Dirty(PyObject *unicode)
699{
700 assert(PyUnicode_Check(unicode));
701 if (Py_REFCNT(unicode) != 1) {
702 PyErr_SetString(PyExc_ValueError,
703 "Cannot modify a string having more than 1 reference");
704 return -1;
705 }
706 _PyUnicode_DIRTY(unicode);
707 return 0;
708}
709
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200710Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200711PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
712 PyObject *from, Py_ssize_t from_start,
713 Py_ssize_t how_many)
714{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200715 unsigned int from_kind, to_kind;
716 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717
Victor Stinnerb1536152011-09-30 02:26:10 +0200718 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
719 PyErr_BadInternalCall();
720 return -1;
721 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200722
723 if (PyUnicode_READY(from))
724 return -1;
725 if (PyUnicode_READY(to))
726 return -1;
727
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200728 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200729 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
730 PyErr_Format(PyExc_ValueError,
731 "Cannot write %zi characters at %zi "
732 "in a string of %zi characters",
733 how_many, to_start, PyUnicode_GET_LENGTH(to));
734 return -1;
735 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200736 if (how_many == 0)
737 return 0;
738
Victor Stinnercd9950f2011-10-02 00:34:53 +0200739 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200740 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200742 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200743 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200745 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200746
Victor Stinnerf42dc442011-10-02 23:33:16 +0200747 if (from_kind == to_kind
748 /* deny latin1 => ascii */
749 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
750 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200751 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200752 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200753 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200754 + PyUnicode_KIND_SIZE(from_kind, from_start),
755 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200756 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200757 else if (from_kind == PyUnicode_1BYTE_KIND
758 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200759 {
760 _PyUnicode_CONVERT_BYTES(
761 Py_UCS1, Py_UCS2,
762 PyUnicode_1BYTE_DATA(from) + from_start,
763 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
764 PyUnicode_2BYTE_DATA(to) + to_start
765 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200766 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200767 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200768 && to_kind == PyUnicode_4BYTE_KIND)
769 {
770 _PyUnicode_CONVERT_BYTES(
771 Py_UCS1, Py_UCS4,
772 PyUnicode_1BYTE_DATA(from) + from_start,
773 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
774 PyUnicode_4BYTE_DATA(to) + to_start
775 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200776 }
777 else if (from_kind == PyUnicode_2BYTE_KIND
778 && to_kind == PyUnicode_4BYTE_KIND)
779 {
780 _PyUnicode_CONVERT_BYTES(
781 Py_UCS2, Py_UCS4,
782 PyUnicode_2BYTE_DATA(from) + from_start,
783 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
784 PyUnicode_4BYTE_DATA(to) + to_start
785 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200786 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200787 else {
788 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200789
790 /* check if max_char(from substring) <= max_char(to) */
791 if (from_kind > to_kind
792 /* latin1 => ascii */
793 || (PyUnicode_IS_COMPACT_ASCII(to)
794 && to_kind == PyUnicode_1BYTE_KIND
795 && !PyUnicode_IS_COMPACT_ASCII(from)))
796 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200797 /* slow path to check for character overflow */
798 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
799 Py_UCS4 ch, maxchar;
800 Py_ssize_t i;
801
802 maxchar = 0;
803 invalid_kinds = 0;
804 for (i=0; i < how_many; i++) {
805 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
806 if (ch > maxchar) {
807 maxchar = ch;
808 if (maxchar > to_maxchar) {
809 invalid_kinds = 1;
810 break;
811 }
812 }
813 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
814 }
815 }
816 else
817 invalid_kinds = 1;
818 if (invalid_kinds) {
819 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200820 "Cannot copy %s characters "
821 "into a string of %s characters",
822 unicode_kind_name(from),
823 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200824 return -1;
825 }
826 }
827 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828}
829
Victor Stinner17222162011-09-28 22:15:37 +0200830/* Find the maximum code point and count the number of surrogate pairs so a
831 correct string length can be computed before converting a string to UCS4.
832 This function counts single surrogates as a character and not as a pair.
833
834 Return 0 on success, or -1 on error. */
835static int
836find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
837 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838{
839 const wchar_t *iter;
840
Victor Stinnerc53be962011-10-02 21:33:54 +0200841 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842 if (num_surrogates == NULL || maxchar == NULL) {
843 PyErr_SetString(PyExc_SystemError,
844 "unexpected NULL arguments to "
845 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
846 return -1;
847 }
848
849 *num_surrogates = 0;
850 *maxchar = 0;
851
852 for (iter = begin; iter < end; ) {
853 if (*iter > *maxchar)
854 *maxchar = *iter;
855#if SIZEOF_WCHAR_T == 2
856 if (*iter >= 0xD800 && *iter <= 0xDBFF
857 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
858 {
859 Py_UCS4 surrogate_val;
860 surrogate_val = (((iter[0] & 0x3FF)<<10)
861 | (iter[1] & 0x3FF)) + 0x10000;
862 ++(*num_surrogates);
863 if (surrogate_val > *maxchar)
864 *maxchar = surrogate_val;
865 iter += 2;
866 }
867 else
868 iter++;
869#else
870 iter++;
871#endif
872 }
873 return 0;
874}
875
876#ifdef Py_DEBUG
877int unicode_ready_calls = 0;
878#endif
879
880int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200881_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200883 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 wchar_t *end;
885 Py_UCS4 maxchar = 0;
886 Py_ssize_t num_surrogates;
887#if SIZEOF_WCHAR_T == 2
888 Py_ssize_t length_wo_surrogates;
889#endif
890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200891 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200892 strings were created using _PyObject_New() and where no canonical
893 representation (the str field) has been set yet aka strings
894 which are not yet ready. */
895 assert(PyUnicode_Check(obj));
896 assert(!PyUnicode_IS_READY(obj));
897 assert(!PyUnicode_IS_COMPACT(obj));
898 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +0200900 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200901 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200902 /* Actually, it should neither be interned nor be anything else: */
903 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904
905#ifdef Py_DEBUG
906 ++unicode_ready_calls;
907#endif
908
909 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200910 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200911 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200913
914 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +0200915 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
916 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 PyErr_NoMemory();
918 return -1;
919 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200920 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921 _PyUnicode_WSTR(unicode), end,
922 PyUnicode_1BYTE_DATA(unicode));
923 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
924 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
925 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
926 if (maxchar < 128) {
Victor Stinnerc3c74152011-10-02 20:39:55 +0200927 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200928 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200929 }
930 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200931 _PyUnicode_UTF8(unicode) = NULL;
932 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200933 }
934 PyObject_FREE(_PyUnicode_WSTR(unicode));
935 _PyUnicode_WSTR(unicode) = NULL;
936 _PyUnicode_WSTR_LENGTH(unicode) = 0;
937 }
938 /* In this case we might have to convert down from 4-byte native
939 wchar_t to 2-byte unicode. */
940 else if (maxchar < 65536) {
941 assert(num_surrogates == 0 &&
942 "FindMaxCharAndNumSurrogatePairs() messed up");
943
Victor Stinner506f5922011-09-28 22:34:18 +0200944#if SIZEOF_WCHAR_T == 2
945 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +0200946 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +0200947 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
948 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
949 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200950 _PyUnicode_UTF8(unicode) = NULL;
951 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200952#else
953 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +0200954 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +0200955 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +0200956 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +0200957 PyErr_NoMemory();
958 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200959 }
Victor Stinner506f5922011-09-28 22:34:18 +0200960 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
961 _PyUnicode_WSTR(unicode), end,
962 PyUnicode_2BYTE_DATA(unicode));
963 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
964 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
965 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200966 _PyUnicode_UTF8(unicode) = NULL;
967 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200968 PyObject_FREE(_PyUnicode_WSTR(unicode));
969 _PyUnicode_WSTR(unicode) = NULL;
970 _PyUnicode_WSTR_LENGTH(unicode) = 0;
971#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 }
973 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
974 else {
975#if SIZEOF_WCHAR_T == 2
976 /* in case the native representation is 2-bytes, we need to allocate a
977 new normalized 4-byte version. */
978 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200979 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
980 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981 PyErr_NoMemory();
982 return -1;
983 }
984 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
985 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200986 _PyUnicode_UTF8(unicode) = NULL;
987 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinnerc53be962011-10-02 21:33:54 +0200988 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989 PyObject_FREE(_PyUnicode_WSTR(unicode));
990 _PyUnicode_WSTR(unicode) = NULL;
991 _PyUnicode_WSTR_LENGTH(unicode) = 0;
992#else
993 assert(num_surrogates == 0);
994
Victor Stinnerc3c74152011-10-02 20:39:55 +0200995 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200996 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200997 _PyUnicode_UTF8(unicode) = NULL;
998 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200999 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1000#endif
1001 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1002 }
1003 _PyUnicode_STATE(unicode).ready = 1;
1004 return 0;
1005}
1006
Alexander Belopolsky40018472011-02-26 01:02:56 +00001007static void
1008unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001009{
Walter Dörwald16807132007-05-25 13:52:07 +00001010 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001011 case SSTATE_NOT_INTERNED:
1012 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001013
Benjamin Peterson29060642009-01-31 22:14:21 +00001014 case SSTATE_INTERNED_MORTAL:
1015 /* revive dead object temporarily for DelItem */
1016 Py_REFCNT(unicode) = 3;
1017 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1018 Py_FatalError(
1019 "deletion of interned string failed");
1020 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001021
Benjamin Peterson29060642009-01-31 22:14:21 +00001022 case SSTATE_INTERNED_IMMORTAL:
1023 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001024
Benjamin Peterson29060642009-01-31 22:14:21 +00001025 default:
1026 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001027 }
1028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001029 if (_PyUnicode_WSTR(unicode) &&
1030 (!PyUnicode_IS_READY(unicode) ||
1031 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1032 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001033 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001034 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035
1036 if (PyUnicode_IS_COMPACT(unicode)) {
1037 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 }
1039 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001040 if (_PyUnicode_DATA_ANY(unicode))
1041 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001042 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 }
1044}
1045
Alexander Belopolsky40018472011-02-26 01:02:56 +00001046static int
1047_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001048{
1049 register PyUnicodeObject *v;
1050
1051 /* Argument checks */
1052 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001053 PyErr_BadInternalCall();
1054 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001055 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001056 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
1058 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001059 PyErr_BadInternalCall();
1060 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001061 }
1062
1063 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 possible since these are being shared.
1065 The same goes for new-representation unicode objects or objects which
1066 have already been readied.
1067 For these, we simply return a fresh copy with the same Unicode content.
1068 */
1069 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
1070 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
1071 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001072 PyUnicodeObject *w = _PyUnicode_New(length);
1073 if (w == NULL)
1074 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
1076 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +00001077 Py_DECREF(*unicode);
1078 *unicode = w;
1079 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001080 }
1081
1082 /* Note that we don't have to modify *unicode for unshared Unicode
1083 objects, since we can modify them in-place. */
1084 return unicode_resize(v, length);
1085}
1086
Alexander Belopolsky40018472011-02-26 01:02:56 +00001087int
1088PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001089{
1090 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
1091}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093static PyObject*
1094get_latin1_char(unsigned char ch)
1095{
Victor Stinnera464fc12011-10-02 20:39:30 +02001096 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001098 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001099 if (!unicode)
1100 return NULL;
1101 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1102 unicode_latin1[ch] = unicode;
1103 }
1104 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001105 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106}
1107
Alexander Belopolsky40018472011-02-26 01:02:56 +00001108PyObject *
1109PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110{
1111 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001112 Py_UCS4 maxchar = 0;
1113 Py_ssize_t num_surrogates;
1114
1115 if (u == NULL)
1116 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001118 /* If the Unicode data is known at construction time, we can apply
1119 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 /* Optimization for empty strings */
1122 if (size == 0 && unicode_empty != NULL) {
1123 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001124 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001125 }
Tim Petersced69f82003-09-16 20:30:58 +00001126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 /* Single character Unicode objects in the Latin-1 range are
1128 shared when using this constructor */
1129 if (size == 1 && *u < 256)
1130 return get_latin1_char((unsigned char)*u);
1131
1132 /* If not empty and not single character, copy the Unicode data
1133 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001134 if (find_maxchar_surrogates(u, u + size,
1135 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136 return NULL;
1137
1138 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1139 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 if (!unicode)
1141 return NULL;
1142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143 switch (PyUnicode_KIND(unicode)) {
1144 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001145 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1147 break;
1148 case PyUnicode_2BYTE_KIND:
1149#if Py_UNICODE_SIZE == 2
1150 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1151#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001152 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1154#endif
1155 break;
1156 case PyUnicode_4BYTE_KIND:
1157#if SIZEOF_WCHAR_T == 2
1158 /* This is the only case which has to process surrogates, thus
1159 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001160 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161#else
1162 assert(num_surrogates == 0);
1163 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1164#endif
1165 break;
1166 default:
1167 assert(0 && "Impossible state");
1168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169
1170 return (PyObject *)unicode;
1171}
1172
Alexander Belopolsky40018472011-02-26 01:02:56 +00001173PyObject *
1174PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001175{
1176 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001177
Benjamin Peterson14339b62009-01-31 16:36:08 +00001178 if (size < 0) {
1179 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001180 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001181 return NULL;
1182 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001183
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001184 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001185 some optimizations which share commonly used objects.
1186 Also, this means the input must be UTF-8, so fall back to the
1187 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001188 if (u != NULL) {
1189
Benjamin Peterson29060642009-01-31 22:14:21 +00001190 /* Optimization for empty strings */
1191 if (size == 0 && unicode_empty != NULL) {
1192 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001193 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001195
1196 /* Single characters are shared when using this constructor.
1197 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001198 if (size == 1 && Py_CHARMASK(*u) < 128)
1199 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001200
1201 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001202 }
1203
Walter Dörwald55507312007-05-18 13:12:10 +00001204 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001205 if (!unicode)
1206 return NULL;
1207
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001208 return (PyObject *)unicode;
1209}
1210
Alexander Belopolsky40018472011-02-26 01:02:56 +00001211PyObject *
1212PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001213{
1214 size_t size = strlen(u);
1215 if (size > PY_SSIZE_T_MAX) {
1216 PyErr_SetString(PyExc_OverflowError, "input too long");
1217 return NULL;
1218 }
1219
1220 return PyUnicode_FromStringAndSize(u, size);
1221}
1222
Victor Stinnere57b1c02011-09-28 22:20:48 +02001223static PyObject*
1224_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001225{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 PyObject *res;
1227 unsigned char max = 127;
1228 Py_ssize_t i;
1229 for (i = 0; i < size; i++) {
1230 if (u[i] & 0x80) {
1231 max = 255;
1232 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001233 }
1234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 res = PyUnicode_New(size, max);
1236 if (!res)
1237 return NULL;
1238 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1239 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001240}
1241
Victor Stinnere57b1c02011-09-28 22:20:48 +02001242static PyObject*
1243_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244{
1245 PyObject *res;
1246 Py_UCS2 max = 0;
1247 Py_ssize_t i;
1248 for (i = 0; i < size; i++)
1249 if (u[i] > max)
1250 max = u[i];
1251 res = PyUnicode_New(size, max);
1252 if (!res)
1253 return NULL;
1254 if (max >= 256)
1255 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1256 else
1257 for (i = 0; i < size; i++)
1258 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1259 return res;
1260}
1261
Victor Stinnere57b1c02011-09-28 22:20:48 +02001262static PyObject*
1263_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264{
1265 PyObject *res;
1266 Py_UCS4 max = 0;
1267 Py_ssize_t i;
1268 for (i = 0; i < size; i++)
1269 if (u[i] > max)
1270 max = u[i];
1271 res = PyUnicode_New(size, max);
1272 if (!res)
1273 return NULL;
1274 if (max >= 0x10000)
1275 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1276 else {
1277 int kind = PyUnicode_KIND(res);
1278 void *data = PyUnicode_DATA(res);
1279 for (i = 0; i < size; i++)
1280 PyUnicode_WRITE(kind, data, i, u[i]);
1281 }
1282 return res;
1283}
1284
1285PyObject*
1286PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1287{
1288 switch(kind) {
1289 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001290 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001291 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001292 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001293 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001294 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001295 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001296 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 return NULL;
1298}
1299
Victor Stinner034f6cf2011-09-30 02:26:44 +02001300PyObject*
1301PyUnicode_Copy(PyObject *unicode)
1302{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001303 Py_ssize_t size;
1304 PyObject *copy;
1305 void *data;
1306
Victor Stinner034f6cf2011-09-30 02:26:44 +02001307 if (!PyUnicode_Check(unicode)) {
1308 PyErr_BadInternalCall();
1309 return NULL;
1310 }
1311 if (PyUnicode_READY(unicode))
1312 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001313
1314 size = PyUnicode_GET_LENGTH(unicode);
1315 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1316 if (!copy)
1317 return NULL;
1318 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1319
1320 data = PyUnicode_DATA(unicode);
1321 switch (PyUnicode_KIND(unicode))
1322 {
1323 case PyUnicode_1BYTE_KIND:
1324 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1325 break;
1326 case PyUnicode_2BYTE_KIND:
1327 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1328 break;
1329 case PyUnicode_4BYTE_KIND:
1330 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1331 break;
1332 default:
1333 assert(0);
1334 break;
1335 }
1336 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001337}
1338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339
Victor Stinnerbc603d12011-10-02 01:00:40 +02001340/* Widen Unicode objects to larger buffers. Don't write terminating null
1341 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342
1343void*
1344_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1345{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001346 Py_ssize_t len;
1347 void *result;
1348 unsigned int skind;
1349
1350 if (PyUnicode_READY(s))
1351 return NULL;
1352
1353 len = PyUnicode_GET_LENGTH(s);
1354 skind = PyUnicode_KIND(s);
1355 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1357 return NULL;
1358 }
1359 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001360 case PyUnicode_2BYTE_KIND:
1361 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1362 if (!result)
1363 return PyErr_NoMemory();
1364 assert(skind == PyUnicode_1BYTE_KIND);
1365 _PyUnicode_CONVERT_BYTES(
1366 Py_UCS1, Py_UCS2,
1367 PyUnicode_1BYTE_DATA(s),
1368 PyUnicode_1BYTE_DATA(s) + len,
1369 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001371 case PyUnicode_4BYTE_KIND:
1372 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1373 if (!result)
1374 return PyErr_NoMemory();
1375 if (skind == PyUnicode_2BYTE_KIND) {
1376 _PyUnicode_CONVERT_BYTES(
1377 Py_UCS2, Py_UCS4,
1378 PyUnicode_2BYTE_DATA(s),
1379 PyUnicode_2BYTE_DATA(s) + len,
1380 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001382 else {
1383 assert(skind == PyUnicode_1BYTE_KIND);
1384 _PyUnicode_CONVERT_BYTES(
1385 Py_UCS1, Py_UCS4,
1386 PyUnicode_1BYTE_DATA(s),
1387 PyUnicode_1BYTE_DATA(s) + len,
1388 result);
1389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001391 default:
1392 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001394 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 return NULL;
1396}
1397
1398static Py_UCS4*
1399as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1400 int copy_null)
1401{
1402 int kind;
1403 void *data;
1404 Py_ssize_t len, targetlen;
1405 if (PyUnicode_READY(string) == -1)
1406 return NULL;
1407 kind = PyUnicode_KIND(string);
1408 data = PyUnicode_DATA(string);
1409 len = PyUnicode_GET_LENGTH(string);
1410 targetlen = len;
1411 if (copy_null)
1412 targetlen++;
1413 if (!target) {
1414 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1415 PyErr_NoMemory();
1416 return NULL;
1417 }
1418 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1419 if (!target) {
1420 PyErr_NoMemory();
1421 return NULL;
1422 }
1423 }
1424 else {
1425 if (targetsize < targetlen) {
1426 PyErr_Format(PyExc_SystemError,
1427 "string is longer than the buffer");
1428 if (copy_null && 0 < targetsize)
1429 target[0] = 0;
1430 return NULL;
1431 }
1432 }
1433 if (kind != PyUnicode_4BYTE_KIND) {
1434 Py_ssize_t i;
1435 for (i = 0; i < len; i++)
1436 target[i] = PyUnicode_READ(kind, data, i);
1437 }
1438 else
1439 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1440 if (copy_null)
1441 target[len] = 0;
1442 return target;
1443}
1444
1445Py_UCS4*
1446PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1447 int copy_null)
1448{
1449 if (target == NULL || targetsize < 1) {
1450 PyErr_BadInternalCall();
1451 return NULL;
1452 }
1453 return as_ucs4(string, target, targetsize, copy_null);
1454}
1455
1456Py_UCS4*
1457PyUnicode_AsUCS4Copy(PyObject *string)
1458{
1459 return as_ucs4(string, NULL, 0, 1);
1460}
1461
1462#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001463
Alexander Belopolsky40018472011-02-26 01:02:56 +00001464PyObject *
1465PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001466{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001467 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001468 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001470 PyErr_BadInternalCall();
1471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001472 }
1473
Martin v. Löwis790465f2008-04-05 20:41:37 +00001474 if (size == -1) {
1475 size = wcslen(w);
1476 }
1477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001479}
1480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001482
Walter Dörwald346737f2007-05-31 10:44:43 +00001483static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001484makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1485 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001486{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001487 *fmt++ = '%';
1488 if (width) {
1489 if (zeropad)
1490 *fmt++ = '0';
1491 fmt += sprintf(fmt, "%d", width);
1492 }
1493 if (precision)
1494 fmt += sprintf(fmt, ".%d", precision);
1495 if (longflag)
1496 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001497 else if (longlongflag) {
1498 /* longlongflag should only ever be nonzero on machines with
1499 HAVE_LONG_LONG defined */
1500#ifdef HAVE_LONG_LONG
1501 char *f = PY_FORMAT_LONG_LONG;
1502 while (*f)
1503 *fmt++ = *f++;
1504#else
1505 /* we shouldn't ever get here */
1506 assert(0);
1507 *fmt++ = 'l';
1508#endif
1509 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001510 else if (size_tflag) {
1511 char *f = PY_FORMAT_SIZE_T;
1512 while (*f)
1513 *fmt++ = *f++;
1514 }
1515 *fmt++ = c;
1516 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001517}
1518
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that sits between the '%' and the conversion character.

   `f` must point at the '%'.  The function reads an optional decimal width,
   an optional ".precision", and an optional length modifier ("l", "ll" when
   HAVE_LONG_LONG is defined, or "z").  A length modifier is only recognised
   when it is directly followed by 'd', 'u' or 'i'.

   Each p_* output pointer may be NULL, in which case that value is simply
   not reported.  Missing width/precision are reported as 0.

   Returns a pointer to the character the caller should treat as the
   conversion character.  For malformed input the pointer is moved back by
   one so that the caller lands on an unknown code:
     "%.3%s"  -> points at '3'
     "%.1"    -> points at '1' (never at the terminating NUL) */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
/* true for the conversions that accept a length modifier */
#define IS_INT_CONVERSION(ch) ((ch) == 'd' || (ch) == 'u' || (ch) == 'i')

    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%' and parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';

    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %li, %lld, %llu, %lli and the size_t variants. */
    if (*f == 'l') {
        if (IS_INT_CONVERSION(f[1])) {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' && IS_INT_CONVERSION(f[2])) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z' && IS_INT_CONVERSION(f[1])) {
        size_tflag = 1;
        ++f;
    }

    /* report only what the caller asked for */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;

#undef IS_INT_CONVERSION
}
1583
/* Upper bound on the number of characters produced by %ld: a 64-bit integer
   printed in decimal needs at most 20 digits, plus one for an optional
   sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1591
Walter Dörwaldd2034312007-05-18 16:29:38 +00001592PyObject *
1593PyUnicode_FromFormatV(const char *format, va_list vargs)
1594{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001595 va_list count;
1596 Py_ssize_t callcount = 0;
1597 PyObject **callresults = NULL;
1598 PyObject **callresult = NULL;
1599 Py_ssize_t n = 0;
1600 int width = 0;
1601 int precision = 0;
1602 int zeropad;
1603 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001605 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001606 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1608 Py_UCS4 argmaxchar;
1609 Py_ssize_t numbersize = 0;
1610 char *numberresults = NULL;
1611 char *numberresult = NULL;
1612 Py_ssize_t i;
1613 int kind;
1614 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001615
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001616 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001617 /* step 1: count the number of %S/%R/%A/%s format specifications
1618 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1619 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001620 * result in an array)
1621 * also esimate a upper bound for all the number formats in the string,
1622 * numbers will be formated in step 3 and be keept in a '\0'-separated
1623 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001624 for (f = format; *f; f++) {
1625 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001626 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1628 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1629 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1630 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001633#ifdef HAVE_LONG_LONG
1634 if (longlongflag) {
1635 if (width < MAX_LONG_LONG_CHARS)
1636 width = MAX_LONG_LONG_CHARS;
1637 }
1638 else
1639#endif
1640 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1641 including sign. Decimal takes the most space. This
1642 isn't enough for octal. If a width is specified we
1643 need more (which we allocate later). */
1644 if (width < MAX_LONG_CHARS)
1645 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646
1647 /* account for the size + '\0' to separate numbers
1648 inside of the numberresults buffer */
1649 numbersize += (width + 1);
1650 }
1651 }
1652 else if ((unsigned char)*f > 127) {
1653 PyErr_Format(PyExc_ValueError,
1654 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1655 "string, got a non-ASCII byte: 0x%02x",
1656 (unsigned char)*f);
1657 return NULL;
1658 }
1659 }
1660 /* step 2: allocate memory for the results of
1661 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1662 if (callcount) {
1663 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1664 if (!callresults) {
1665 PyErr_NoMemory();
1666 return NULL;
1667 }
1668 callresult = callresults;
1669 }
1670 /* step 2.5: allocate memory for the results of formating numbers */
1671 if (numbersize) {
1672 numberresults = PyObject_Malloc(numbersize);
1673 if (!numberresults) {
1674 PyErr_NoMemory();
1675 goto fail;
1676 }
1677 numberresult = numberresults;
1678 }
1679
1680 /* step 3: format numbers and figure out how large a buffer we need */
1681 for (f = format; *f; f++) {
1682 if (*f == '%') {
1683 const char* p;
1684 int longflag;
1685 int longlongflag;
1686 int size_tflag;
1687 int numprinted;
1688
1689 p = f;
1690 zeropad = (f[1] == '0');
1691 f = parse_format_flags(f, &width, &precision,
1692 &longflag, &longlongflag, &size_tflag);
1693 switch (*f) {
1694 case 'c':
1695 {
1696 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001697 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 n++;
1699 break;
1700 }
1701 case '%':
1702 n++;
1703 break;
1704 case 'i':
1705 case 'd':
1706 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1707 width, precision, *f);
1708 if (longflag)
1709 numprinted = sprintf(numberresult, fmt,
1710 va_arg(count, long));
1711#ifdef HAVE_LONG_LONG
1712 else if (longlongflag)
1713 numprinted = sprintf(numberresult, fmt,
1714 va_arg(count, PY_LONG_LONG));
1715#endif
1716 else if (size_tflag)
1717 numprinted = sprintf(numberresult, fmt,
1718 va_arg(count, Py_ssize_t));
1719 else
1720 numprinted = sprintf(numberresult, fmt,
1721 va_arg(count, int));
1722 n += numprinted;
1723 /* advance by +1 to skip over the '\0' */
1724 numberresult += (numprinted + 1);
1725 assert(*(numberresult - 1) == '\0');
1726 assert(*(numberresult - 2) != '\0');
1727 assert(numprinted >= 0);
1728 assert(numberresult <= numberresults + numbersize);
1729 break;
1730 case 'u':
1731 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1732 width, precision, 'u');
1733 if (longflag)
1734 numprinted = sprintf(numberresult, fmt,
1735 va_arg(count, unsigned long));
1736#ifdef HAVE_LONG_LONG
1737 else if (longlongflag)
1738 numprinted = sprintf(numberresult, fmt,
1739 va_arg(count, unsigned PY_LONG_LONG));
1740#endif
1741 else if (size_tflag)
1742 numprinted = sprintf(numberresult, fmt,
1743 va_arg(count, size_t));
1744 else
1745 numprinted = sprintf(numberresult, fmt,
1746 va_arg(count, unsigned int));
1747 n += numprinted;
1748 numberresult += (numprinted + 1);
1749 assert(*(numberresult - 1) == '\0');
1750 assert(*(numberresult - 2) != '\0');
1751 assert(numprinted >= 0);
1752 assert(numberresult <= numberresults + numbersize);
1753 break;
1754 case 'x':
1755 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1756 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1757 n += numprinted;
1758 numberresult += (numprinted + 1);
1759 assert(*(numberresult - 1) == '\0');
1760 assert(*(numberresult - 2) != '\0');
1761 assert(numprinted >= 0);
1762 assert(numberresult <= numberresults + numbersize);
1763 break;
1764 case 'p':
1765 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1766 /* %p is ill-defined: ensure leading 0x. */
1767 if (numberresult[1] == 'X')
1768 numberresult[1] = 'x';
1769 else if (numberresult[1] != 'x') {
1770 memmove(numberresult + 2, numberresult,
1771 strlen(numberresult) + 1);
1772 numberresult[0] = '0';
1773 numberresult[1] = 'x';
1774 numprinted += 2;
1775 }
1776 n += numprinted;
1777 numberresult += (numprinted + 1);
1778 assert(*(numberresult - 1) == '\0');
1779 assert(*(numberresult - 2) != '\0');
1780 assert(numprinted >= 0);
1781 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001782 break;
1783 case 's':
1784 {
1785 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001786 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001787 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1788 if (!str)
1789 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 /* since PyUnicode_DecodeUTF8 returns already flexible
1791 unicode objects, there is no need to call ready on them */
1792 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001793 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001795 /* Remember the str and switch to the next slot */
1796 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001797 break;
1798 }
1799 case 'U':
1800 {
1801 PyObject *obj = va_arg(count, PyObject *);
1802 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 if (PyUnicode_READY(obj) == -1)
1804 goto fail;
1805 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001806 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001808 break;
1809 }
1810 case 'V':
1811 {
1812 PyObject *obj = va_arg(count, PyObject *);
1813 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001814 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001815 assert(obj || str);
1816 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001817 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 if (PyUnicode_READY(obj) == -1)
1819 goto fail;
1820 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001821 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001823 *callresult++ = NULL;
1824 }
1825 else {
1826 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1827 if (!str_obj)
1828 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001830 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001832 *callresult++ = str_obj;
1833 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001834 break;
1835 }
1836 case 'S':
1837 {
1838 PyObject *obj = va_arg(count, PyObject *);
1839 PyObject *str;
1840 assert(obj);
1841 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001843 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001845 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001847 /* Remember the str and switch to the next slot */
1848 *callresult++ = str;
1849 break;
1850 }
1851 case 'R':
1852 {
1853 PyObject *obj = va_arg(count, PyObject *);
1854 PyObject *repr;
1855 assert(obj);
1856 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001858 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001859 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001860 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001861 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001862 /* Remember the repr and switch to the next slot */
1863 *callresult++ = repr;
1864 break;
1865 }
1866 case 'A':
1867 {
1868 PyObject *obj = va_arg(count, PyObject *);
1869 PyObject *ascii;
1870 assert(obj);
1871 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001872 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001873 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001875 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001877 /* Remember the repr and switch to the next slot */
1878 *callresult++ = ascii;
1879 break;
1880 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001881 default:
1882 /* if we stumble upon an unknown
1883 formatting code, copy the rest of
1884 the format string to the output
1885 string. (we cannot just skip the
1886 code, since there's no way to know
1887 what's in the argument list) */
1888 n += strlen(p);
1889 goto expand;
1890 }
1891 } else
1892 n++;
1893 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001894 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001895 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001897 we don't have to resize the string.
1898 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001899 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001900 if (!string)
1901 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 kind = PyUnicode_KIND(string);
1903 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001904 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001908 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001909 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001910
1911 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1913 /* checking for == because the last argument could be a empty
1914 string, which causes i to point to end, the assert at the end of
1915 the loop */
1916 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001917
Benjamin Peterson14339b62009-01-31 16:36:08 +00001918 switch (*f) {
1919 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001920 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 const int ordinal = va_arg(vargs, int);
1922 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001923 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001924 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001925 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001926 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001927 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001928 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 case 'p':
1930 /* unused, since we already have the result */
1931 if (*f == 'p')
1932 (void) va_arg(vargs, void *);
1933 else
1934 (void) va_arg(vargs, int);
1935 /* extract the result from numberresults and append. */
1936 for (; *numberresult; ++i, ++numberresult)
1937 PyUnicode_WRITE(kind, data, i, *numberresult);
1938 /* skip over the separating '\0' */
1939 assert(*numberresult == '\0');
1940 numberresult++;
1941 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001942 break;
1943 case 's':
1944 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001945 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001947 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 size = PyUnicode_GET_LENGTH(*callresult);
1949 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001950 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1951 *callresult, 0,
1952 size) < 0)
1953 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001955 /* We're done with the unicode()/repr() => forget it */
1956 Py_DECREF(*callresult);
1957 /* switch to next unicode()/repr() result */
1958 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001959 break;
1960 }
1961 case 'U':
1962 {
1963 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 Py_ssize_t size;
1965 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1966 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001967 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1968 obj, 0,
1969 size) < 0)
1970 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001971 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001972 break;
1973 }
1974 case 'V':
1975 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001976 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001977 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001978 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001979 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980 size = PyUnicode_GET_LENGTH(obj);
1981 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001982 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1983 obj, 0,
1984 size) < 0)
1985 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001987 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 size = PyUnicode_GET_LENGTH(*callresult);
1989 assert(PyUnicode_KIND(*callresult) <=
1990 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001991 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1992 *callresult,
1993 0, size) < 0)
1994 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001996 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001997 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001998 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001999 break;
2000 }
2001 case 'S':
2002 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002003 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002004 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002005 /* unused, since we already have the result */
2006 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002008 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2009 *callresult, 0,
2010 PyUnicode_GET_LENGTH(*callresult)) < 0)
2011 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002013 /* We're done with the unicode()/repr() => forget it */
2014 Py_DECREF(*callresult);
2015 /* switch to next unicode()/repr() result */
2016 ++callresult;
2017 break;
2018 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002019 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002021 break;
2022 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 for (; *p; ++p, ++i)
2024 PyUnicode_WRITE(kind, data, i, *p);
2025 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002026 goto end;
2027 }
Victor Stinner1205f272010-09-11 00:54:47 +00002028 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002029 else {
2030 assert(i < PyUnicode_GET_LENGTH(string));
2031 PyUnicode_WRITE(kind, data, i++, *f);
2032 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002035
Benjamin Peterson29060642009-01-31 22:14:21 +00002036 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002037 if (callresults)
2038 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 if (numberresults)
2040 PyObject_Free(numberresults);
2041 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002042 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002043 if (callresults) {
2044 PyObject **callresult2 = callresults;
2045 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002046 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002047 ++callresult2;
2048 }
2049 PyObject_Free(callresults);
2050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 if (numberresults)
2052 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002053 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002054}
2055
Walter Dörwaldd2034312007-05-18 16:29:38 +00002056PyObject *
2057PyUnicode_FromFormat(const char *format, ...)
2058{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002059 PyObject* ret;
2060 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002061
2062#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002063 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002064#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002065 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002066#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002067 ret = PyUnicode_FromFormatV(format, vargs);
2068 va_end(vargs);
2069 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002070}
2071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002072#ifdef HAVE_WCHAR_H
2073
Victor Stinner5593d8a2010-10-02 11:11:27 +00002074/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2075 convert a Unicode object to a wide character string.
2076
Victor Stinnerd88d9832011-09-06 02:00:05 +02002077 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002078 character) required to convert the unicode object. Ignore size argument.
2079
Victor Stinnerd88d9832011-09-06 02:00:05 +02002080 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002081 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002082 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002083static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002084unicode_aswidechar(PyUnicodeObject *unicode,
2085 wchar_t *w,
2086 Py_ssize_t size)
2087{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002088 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 const wchar_t *wstr;
2090
2091 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2092 if (wstr == NULL)
2093 return -1;
2094
Victor Stinner5593d8a2010-10-02 11:11:27 +00002095 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002096 if (size > res)
2097 size = res + 1;
2098 else
2099 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002101 return res;
2102 }
2103 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002104 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002105}
2106
2107Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002108PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002109 wchar_t *w,
2110 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111{
2112 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002113 PyErr_BadInternalCall();
2114 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002116 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117}
2118
Victor Stinner137c34c2010-09-29 10:25:54 +00002119wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002120PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002121 Py_ssize_t *size)
2122{
2123 wchar_t* buffer;
2124 Py_ssize_t buflen;
2125
2126 if (unicode == NULL) {
2127 PyErr_BadInternalCall();
2128 return NULL;
2129 }
2130
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002131 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002132 if (buflen == -1)
2133 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002134 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002135 PyErr_NoMemory();
2136 return NULL;
2137 }
2138
Victor Stinner137c34c2010-09-29 10:25:54 +00002139 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2140 if (buffer == NULL) {
2141 PyErr_NoMemory();
2142 return NULL;
2143 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002144 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 if (buflen == -1)
2146 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002147 if (size != NULL)
2148 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002149 return buffer;
2150}
2151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002152#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153
Alexander Belopolsky40018472011-02-26 01:02:56 +00002154PyObject *
2155PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002156{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002158 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002159 PyErr_SetString(PyExc_ValueError,
2160 "chr() arg not in range(0x110000)");
2161 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002162 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002164 if (ordinal < 256)
2165 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167 v = PyUnicode_New(1, ordinal);
2168 if (v == NULL)
2169 return NULL;
2170 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2171 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002172}
2173
Alexander Belopolsky40018472011-02-26 01:02:56 +00002174PyObject *
2175PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002177 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002178 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002179 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002180 if (PyUnicode_READY(obj))
2181 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002182 Py_INCREF(obj);
2183 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002184 }
2185 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002186 /* For a Unicode subtype that's not a Unicode object,
2187 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002188 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002189 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002190 PyErr_Format(PyExc_TypeError,
2191 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002192 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002193 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002194}
2195
Alexander Belopolsky40018472011-02-26 01:02:56 +00002196PyObject *
2197PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002198 const char *encoding,
2199 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002200{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002201 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002202 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002203
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002205 PyErr_BadInternalCall();
2206 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002208
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002209 /* Decoding bytes objects is the most common case and should be fast */
2210 if (PyBytes_Check(obj)) {
2211 if (PyBytes_GET_SIZE(obj) == 0) {
2212 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002213 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002214 }
2215 else {
2216 v = PyUnicode_Decode(
2217 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2218 encoding, errors);
2219 }
2220 return v;
2221 }
2222
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002223 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002224 PyErr_SetString(PyExc_TypeError,
2225 "decoding str is not supported");
2226 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002227 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002228
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002229 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2230 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2231 PyErr_Format(PyExc_TypeError,
2232 "coercing to str: need bytes, bytearray "
2233 "or buffer-like object, %.80s found",
2234 Py_TYPE(obj)->tp_name);
2235 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002236 }
Tim Petersced69f82003-09-16 20:30:58 +00002237
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002238 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002239 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002240 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 }
Tim Petersced69f82003-09-16 20:30:58 +00002242 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002243 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002244
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002245 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002246 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247}
2248
/* Write a normalized copy of the codec name `encoding` into `lower`:
   upper-case characters are lowered and '_' becomes '-', so that spellings
   such as "UTF_8" compare equal to "utf-8".

   `lower` is a caller-supplied buffer of `lower_len` bytes.  Returns 1 on
   success (the copy is NUL-terminated), or 0 if the name does not fit in
   lower_len-1 characters; in that case `lower` is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last usable slot; it is reserved for the terminating NUL. */
    char *dst_last = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        char ch;

        if (dst == dst_last)
            return 0;

        if (Py_ISUPPER(*src))
            ch = Py_TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2281
Alexander Belopolsky40018472011-02-26 01:02:56 +00002282PyObject *
2283PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002284 Py_ssize_t size,
2285 const char *encoding,
2286 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002287{
2288 PyObject *buffer = NULL, *unicode;
2289 Py_buffer info;
2290 char lower[11]; /* Enough for any encoding shortcut */
2291
2292 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002293 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002294
2295 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002296 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002297 if ((strcmp(lower, "utf-8") == 0) ||
2298 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002299 return PyUnicode_DecodeUTF8(s, size, errors);
2300 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002301 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002302 (strcmp(lower, "iso-8859-1") == 0))
2303 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002304#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002305 else if (strcmp(lower, "mbcs") == 0)
2306 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002307#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002308 else if (strcmp(lower, "ascii") == 0)
2309 return PyUnicode_DecodeASCII(s, size, errors);
2310 else if (strcmp(lower, "utf-16") == 0)
2311 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2312 else if (strcmp(lower, "utf-32") == 0)
2313 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315
2316 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002317 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002318 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002319 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002320 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002321 if (buffer == NULL)
2322 goto onError;
2323 unicode = PyCodec_Decode(buffer, encoding, errors);
2324 if (unicode == NULL)
2325 goto onError;
2326 if (!PyUnicode_Check(unicode)) {
2327 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002328 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002329 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330 Py_DECREF(unicode);
2331 goto onError;
2332 }
2333 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002334 if (PyUnicode_READY(unicode)) {
2335 Py_DECREF(unicode);
2336 return NULL;
2337 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002338 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002339
Benjamin Peterson29060642009-01-31 22:14:21 +00002340 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002341 Py_XDECREF(buffer);
2342 return NULL;
2343}
2344
Alexander Belopolsky40018472011-02-26 01:02:56 +00002345PyObject *
2346PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002347 const char *encoding,
2348 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002349{
2350 PyObject *v;
2351
2352 if (!PyUnicode_Check(unicode)) {
2353 PyErr_BadArgument();
2354 goto onError;
2355 }
2356
2357 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002358 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002359
2360 /* Decode via the codec registry */
2361 v = PyCodec_Decode(unicode, encoding, errors);
2362 if (v == NULL)
2363 goto onError;
2364 return v;
2365
Benjamin Peterson29060642009-01-31 22:14:21 +00002366 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002367 return NULL;
2368}
2369
Alexander Belopolsky40018472011-02-26 01:02:56 +00002370PyObject *
2371PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002372 const char *encoding,
2373 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002374{
2375 PyObject *v;
2376
2377 if (!PyUnicode_Check(unicode)) {
2378 PyErr_BadArgument();
2379 goto onError;
2380 }
2381
2382 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002383 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002384
2385 /* Decode via the codec registry */
2386 v = PyCodec_Decode(unicode, encoding, errors);
2387 if (v == NULL)
2388 goto onError;
2389 if (!PyUnicode_Check(v)) {
2390 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002391 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002392 Py_TYPE(v)->tp_name);
2393 Py_DECREF(v);
2394 goto onError;
2395 }
2396 return v;
2397
Benjamin Peterson29060642009-01-31 22:14:21 +00002398 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002399 return NULL;
2400}
2401
Alexander Belopolsky40018472011-02-26 01:02:56 +00002402PyObject *
2403PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002404 Py_ssize_t size,
2405 const char *encoding,
2406 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002407{
2408 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002409
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410 unicode = PyUnicode_FromUnicode(s, size);
2411 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2414 Py_DECREF(unicode);
2415 return v;
2416}
2417
Alexander Belopolsky40018472011-02-26 01:02:56 +00002418PyObject *
2419PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002420 const char *encoding,
2421 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002422{
2423 PyObject *v;
2424
2425 if (!PyUnicode_Check(unicode)) {
2426 PyErr_BadArgument();
2427 goto onError;
2428 }
2429
2430 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002431 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002432
2433 /* Encode via the codec registry */
2434 v = PyCodec_Encode(unicode, encoding, errors);
2435 if (v == NULL)
2436 goto onError;
2437 return v;
2438
Benjamin Peterson29060642009-01-31 22:14:21 +00002439 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002440 return NULL;
2441}
2442
Victor Stinnerad158722010-10-27 00:25:46 +00002443PyObject *
2444PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002445{
Victor Stinner99b95382011-07-04 14:23:54 +02002446#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002447 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2448 PyUnicode_GET_SIZE(unicode),
2449 NULL);
2450#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002452#else
Victor Stinner793b5312011-04-27 00:24:21 +02002453 PyInterpreterState *interp = PyThreadState_GET()->interp;
2454 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2455 cannot use it to encode and decode filenames before it is loaded. Load
2456 the Python codec requires to encode at least its own filename. Use the C
2457 version of the locale codec until the codec registry is initialized and
2458 the Python codec is loaded.
2459
2460 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2461 cannot only rely on it: check also interp->fscodec_initialized for
2462 subinterpreters. */
2463 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002464 return PyUnicode_AsEncodedString(unicode,
2465 Py_FileSystemDefaultEncoding,
2466 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002467 }
2468 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002469 /* locale encoding with surrogateescape */
2470 wchar_t *wchar;
2471 char *bytes;
2472 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002473 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002474
2475 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2476 if (wchar == NULL)
2477 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002478 bytes = _Py_wchar2char(wchar, &error_pos);
2479 if (bytes == NULL) {
2480 if (error_pos != (size_t)-1) {
2481 char *errmsg = strerror(errno);
2482 PyObject *exc = NULL;
2483 if (errmsg == NULL)
2484 errmsg = "Py_wchar2char() failed";
2485 raise_encode_exception(&exc,
2486 "filesystemencoding",
2487 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2488 error_pos, error_pos+1,
2489 errmsg);
2490 Py_XDECREF(exc);
2491 }
2492 else
2493 PyErr_NoMemory();
2494 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002495 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002496 }
2497 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002498
2499 bytes_obj = PyBytes_FromString(bytes);
2500 PyMem_Free(bytes);
2501 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002502 }
Victor Stinnerad158722010-10-27 00:25:46 +00002503#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002504}
2505
Alexander Belopolsky40018472011-02-26 01:02:56 +00002506PyObject *
2507PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002508 const char *encoding,
2509 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510{
2511 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002512 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002513
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514 if (!PyUnicode_Check(unicode)) {
2515 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517 }
Fred Drakee4315f52000-05-09 19:53:39 +00002518
Victor Stinner2f283c22011-03-02 01:21:46 +00002519 if (encoding == NULL) {
2520 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002522 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002524 }
Fred Drakee4315f52000-05-09 19:53:39 +00002525
2526 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002527 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002528 if ((strcmp(lower, "utf-8") == 0) ||
2529 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002530 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002531 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002532 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002533 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002535 }
Victor Stinner37296e82010-06-10 13:36:23 +00002536 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002537 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002538 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002540#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002541 else if (strcmp(lower, "mbcs") == 0)
2542 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2543 PyUnicode_GET_SIZE(unicode),
2544 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002545#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002546 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002547 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002548 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549
2550 /* Encode via the codec registry */
2551 v = PyCodec_Encode(unicode, encoding, errors);
2552 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002553 return NULL;
2554
2555 /* The normal path */
2556 if (PyBytes_Check(v))
2557 return v;
2558
2559 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002560 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002561 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002562 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002563
2564 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2565 "encoder %s returned bytearray instead of bytes",
2566 encoding);
2567 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002568 Py_DECREF(v);
2569 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002570 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002571
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002572 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2573 Py_DECREF(v);
2574 return b;
2575 }
2576
2577 PyErr_Format(PyExc_TypeError,
2578 "encoder did not return a bytes object (type=%.400s)",
2579 Py_TYPE(v)->tp_name);
2580 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002581 return NULL;
2582}
2583
Alexander Belopolsky40018472011-02-26 01:02:56 +00002584PyObject *
2585PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002586 const char *encoding,
2587 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002588{
2589 PyObject *v;
2590
2591 if (!PyUnicode_Check(unicode)) {
2592 PyErr_BadArgument();
2593 goto onError;
2594 }
2595
2596 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002597 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002598
2599 /* Encode via the codec registry */
2600 v = PyCodec_Encode(unicode, encoding, errors);
2601 if (v == NULL)
2602 goto onError;
2603 if (!PyUnicode_Check(v)) {
2604 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002605 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002606 Py_TYPE(v)->tp_name);
2607 Py_DECREF(v);
2608 goto onError;
2609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002611
Benjamin Peterson29060642009-01-31 22:14:21 +00002612 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613 return NULL;
2614}
2615
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002616PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002617PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002618 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002619 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2620}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002621
Christian Heimes5894ba72007-11-04 11:43:14 +00002622PyObject*
2623PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2624{
Victor Stinner99b95382011-07-04 14:23:54 +02002625#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002626 return PyUnicode_DecodeMBCS(s, size, NULL);
2627#elif defined(__APPLE__)
2628 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2629#else
Victor Stinner793b5312011-04-27 00:24:21 +02002630 PyInterpreterState *interp = PyThreadState_GET()->interp;
2631 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2632 cannot use it to encode and decode filenames before it is loaded. Load
2633 the Python codec requires to encode at least its own filename. Use the C
2634 version of the locale codec until the codec registry is initialized and
2635 the Python codec is loaded.
2636
2637 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2638 cannot only rely on it: check also interp->fscodec_initialized for
2639 subinterpreters. */
2640 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002641 return PyUnicode_Decode(s, size,
2642 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002643 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002644 }
2645 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002646 /* locale encoding with surrogateescape */
2647 wchar_t *wchar;
2648 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002649 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002650
2651 if (s[size] != '\0' || size != strlen(s)) {
2652 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2653 return NULL;
2654 }
2655
Victor Stinner168e1172010-10-16 23:16:16 +00002656 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002657 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002658 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002659
Victor Stinner168e1172010-10-16 23:16:16 +00002660 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002661 PyMem_Free(wchar);
2662 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002663 }
Victor Stinnerad158722010-10-27 00:25:46 +00002664#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002665}
2666
Martin v. Löwis011e8422009-05-05 04:43:17 +00002667
2668int
2669PyUnicode_FSConverter(PyObject* arg, void* addr)
2670{
2671 PyObject *output = NULL;
2672 Py_ssize_t size;
2673 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002674 if (arg == NULL) {
2675 Py_DECREF(*(PyObject**)addr);
2676 return 1;
2677 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002678 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002679 output = arg;
2680 Py_INCREF(output);
2681 }
2682 else {
2683 arg = PyUnicode_FromObject(arg);
2684 if (!arg)
2685 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002686 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002687 Py_DECREF(arg);
2688 if (!output)
2689 return 0;
2690 if (!PyBytes_Check(output)) {
2691 Py_DECREF(output);
2692 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2693 return 0;
2694 }
2695 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002696 size = PyBytes_GET_SIZE(output);
2697 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002698 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002699 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002700 Py_DECREF(output);
2701 return 0;
2702 }
2703 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002704 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002705}
2706
2707
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002708int
2709PyUnicode_FSDecoder(PyObject* arg, void* addr)
2710{
2711 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002712 if (arg == NULL) {
2713 Py_DECREF(*(PyObject**)addr);
2714 return 1;
2715 }
2716 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002717 if (PyUnicode_READY(arg))
2718 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002719 output = arg;
2720 Py_INCREF(output);
2721 }
2722 else {
2723 arg = PyBytes_FromObject(arg);
2724 if (!arg)
2725 return 0;
2726 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2727 PyBytes_GET_SIZE(arg));
2728 Py_DECREF(arg);
2729 if (!output)
2730 return 0;
2731 if (!PyUnicode_Check(output)) {
2732 Py_DECREF(output);
2733 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2734 return 0;
2735 }
2736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002737 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2738 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002739 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2740 Py_DECREF(output);
2741 return 0;
2742 }
2743 *(PyObject**)addr = output;
2744 return Py_CLEANUP_SUPPORTED;
2745}
2746
2747
Martin v. Löwis5b222132007-06-10 09:51:05 +00002748char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002749PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002750{
Christian Heimesf3863112007-11-22 07:46:41 +00002751 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002752 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2753
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002754 if (!PyUnicode_Check(unicode)) {
2755 PyErr_BadArgument();
2756 return NULL;
2757 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002758 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002759 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002760
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002761 if (PyUnicode_UTF8(unicode) == NULL) {
2762 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002763 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2764 if (bytes == NULL)
2765 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002766 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2767 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002768 Py_DECREF(bytes);
2769 return NULL;
2770 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002771 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2772 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 Py_DECREF(bytes);
2774 }
2775
2776 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002777 *psize = PyUnicode_UTF8_LENGTH(unicode);
2778 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002779}
2780
2781char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002782PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002783{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002784 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2785}
2786
#ifdef Py_DEBUG
/* Debug-build statistic: how many times PyUnicode_AsUnicodeAndSize() found
   no wstr representation on the object and had to build one. */
int unicode_as_unicode_calls = 0;
#endif
2790
2791
2792Py_UNICODE *
2793PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2794{
2795 PyUnicodeObject *u;
2796 const unsigned char *one_byte;
2797#if SIZEOF_WCHAR_T == 4
2798 const Py_UCS2 *two_bytes;
2799#else
2800 const Py_UCS4 *four_bytes;
2801 const Py_UCS4 *ucs4_end;
2802 Py_ssize_t num_surrogates;
2803#endif
2804 wchar_t *w;
2805 wchar_t *wchar_end;
2806
2807 if (!PyUnicode_Check(unicode)) {
2808 PyErr_BadArgument();
2809 return NULL;
2810 }
2811 u = (PyUnicodeObject*)unicode;
2812 if (_PyUnicode_WSTR(u) == NULL) {
2813 /* Non-ASCII compact unicode object */
2814 assert(_PyUnicode_KIND(u) != 0);
2815 assert(PyUnicode_IS_READY(u));
2816
2817#ifdef Py_DEBUG
2818 ++unicode_as_unicode_calls;
2819#endif
2820
2821 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2822#if SIZEOF_WCHAR_T == 2
2823 four_bytes = PyUnicode_4BYTE_DATA(u);
2824 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2825 num_surrogates = 0;
2826
2827 for (; four_bytes < ucs4_end; ++four_bytes) {
2828 if (*four_bytes > 0xFFFF)
2829 ++num_surrogates;
2830 }
2831
2832 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2833 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2834 if (!_PyUnicode_WSTR(u)) {
2835 PyErr_NoMemory();
2836 return NULL;
2837 }
2838 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2839
2840 w = _PyUnicode_WSTR(u);
2841 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2842 four_bytes = PyUnicode_4BYTE_DATA(u);
2843 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2844 if (*four_bytes > 0xFFFF) {
2845 /* encode surrogate pair in this case */
2846 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2847 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2848 }
2849 else
2850 *w = *four_bytes;
2851
2852 if (w > wchar_end) {
2853 assert(0 && "Miscalculated string end");
2854 }
2855 }
2856 *w = 0;
2857#else
2858 /* sizeof(wchar_t) == 4 */
2859 Py_FatalError("Impossible unicode object state, wstr and str "
2860 "should share memory already.");
2861 return NULL;
2862#endif
2863 }
2864 else {
2865 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2866 (_PyUnicode_LENGTH(u) + 1));
2867 if (!_PyUnicode_WSTR(u)) {
2868 PyErr_NoMemory();
2869 return NULL;
2870 }
2871 if (!PyUnicode_IS_COMPACT_ASCII(u))
2872 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2873 w = _PyUnicode_WSTR(u);
2874 wchar_end = w + _PyUnicode_LENGTH(u);
2875
2876 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2877 one_byte = PyUnicode_1BYTE_DATA(u);
2878 for (; w < wchar_end; ++one_byte, ++w)
2879 *w = *one_byte;
2880 /* null-terminate the wstr */
2881 *w = 0;
2882 }
2883 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2884#if SIZEOF_WCHAR_T == 4
2885 two_bytes = PyUnicode_2BYTE_DATA(u);
2886 for (; w < wchar_end; ++two_bytes, ++w)
2887 *w = *two_bytes;
2888 /* null-terminate the wstr */
2889 *w = 0;
2890#else
2891 /* sizeof(wchar_t) == 2 */
2892 PyObject_FREE(_PyUnicode_WSTR(u));
2893 _PyUnicode_WSTR(u) = NULL;
2894 Py_FatalError("Impossible unicode object state, wstr "
2895 "and str should share memory already.");
2896 return NULL;
2897#endif
2898 }
2899 else {
2900 assert(0 && "This should never happen.");
2901 }
2902 }
2903 }
2904 if (size != NULL)
2905 *size = PyUnicode_WSTR_LENGTH(u);
2906 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002907}
2908
Alexander Belopolsky40018472011-02-26 01:02:56 +00002909Py_UNICODE *
2910PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002912 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002913}
2914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002915
Alexander Belopolsky40018472011-02-26 01:02:56 +00002916Py_ssize_t
2917PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918{
2919 if (!PyUnicode_Check(unicode)) {
2920 PyErr_BadArgument();
2921 goto onError;
2922 }
2923 return PyUnicode_GET_SIZE(unicode);
2924
Benjamin Peterson29060642009-01-31 22:14:21 +00002925 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 return -1;
2927}
2928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002929Py_ssize_t
2930PyUnicode_GetLength(PyObject *unicode)
2931{
Victor Stinner5a706cf2011-10-02 00:36:53 +02002932 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002933 PyErr_BadArgument();
2934 return -1;
2935 }
2936
2937 return PyUnicode_GET_LENGTH(unicode);
2938}
2939
2940Py_UCS4
2941PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2942{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02002943 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
2944 PyErr_BadArgument();
2945 return (Py_UCS4)-1;
2946 }
2947 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
2948 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002949 return (Py_UCS4)-1;
2950 }
2951 return PyUnicode_READ_CHAR(unicode, index);
2952}
2953
2954int
2955PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2956{
2957 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02002958 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002959 return -1;
2960 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02002961 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
2962 PyErr_SetString(PyExc_IndexError, "string index out of range");
2963 return -1;
2964 }
2965 if (_PyUnicode_Dirty(unicode))
2966 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002967 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2968 index, ch);
2969 return 0;
2970}
2971
/* Return the name of the default encoding, which is fixed to "utf-8".
   The returned string is a literal and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2977
Victor Stinner554f3f02010-06-16 23:33:54 +00002978/* create or adjust a UnicodeDecodeError */
2979static void
2980make_decode_exception(PyObject **exceptionObject,
2981 const char *encoding,
2982 const char *input, Py_ssize_t length,
2983 Py_ssize_t startpos, Py_ssize_t endpos,
2984 const char *reason)
2985{
2986 if (*exceptionObject == NULL) {
2987 *exceptionObject = PyUnicodeDecodeError_Create(
2988 encoding, input, length, startpos, endpos, reason);
2989 }
2990 else {
2991 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2992 goto onError;
2993 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2994 goto onError;
2995 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2996 goto onError;
2997 }
2998 return;
2999
3000onError:
3001 Py_DECREF(*exceptionObject);
3002 *exceptionObject = NULL;
3003}
3004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003005/* error handling callback helper:
3006 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003007 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003008 and adjust various state variables.
3009 return 0 on success, -1 on error
3010*/
3011
Alexander Belopolsky40018472011-02-26 01:02:56 +00003012static int
3013unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003014 const char *encoding, const char *reason,
3015 const char **input, const char **inend, Py_ssize_t *startinpos,
3016 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3017 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003018{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003019 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003020
3021 PyObject *restuple = NULL;
3022 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003023 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003024 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003025 Py_ssize_t requiredsize;
3026 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003027 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003028 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003029 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003030 int res = -1;
3031
3032 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 *errorHandler = PyCodec_LookupError(errors);
3034 if (*errorHandler == NULL)
3035 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003036 }
3037
Victor Stinner554f3f02010-06-16 23:33:54 +00003038 make_decode_exception(exceptionObject,
3039 encoding,
3040 *input, *inend - *input,
3041 *startinpos, *endinpos,
3042 reason);
3043 if (*exceptionObject == NULL)
3044 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003045
3046 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3047 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003048 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003049 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003050 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003051 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003052 }
3053 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003054 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003055
3056 /* Copy back the bytes variables, which might have been modified by the
3057 callback */
3058 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3059 if (!inputobj)
3060 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003061 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003062 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003063 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003064 *input = PyBytes_AS_STRING(inputobj);
3065 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003066 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003067 /* we can DECREF safely, as the exception has another reference,
3068 so the object won't go away. */
3069 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003071 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003072 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003073 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003074 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3075 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003076 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003077
3078 /* need more space? (at least enough for what we
3079 have+the replacement+the rest of the string (starting
3080 at the new input position), so we won't have to check space
3081 when there are no errors in the rest of the string) */
3082 repptr = PyUnicode_AS_UNICODE(repunicode);
3083 repsize = PyUnicode_GET_SIZE(repunicode);
3084 requiredsize = *outpos + repsize + insize-newpos;
3085 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003086 if (requiredsize<2*outsize)
3087 requiredsize = 2*outsize;
3088 if (_PyUnicode_Resize(output, requiredsize) < 0)
3089 goto onError;
3090 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 }
3092 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003093 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 Py_UNICODE_COPY(*outptr, repptr, repsize);
3095 *outptr += repsize;
3096 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003097
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003098 /* we made it! */
3099 res = 0;
3100
Benjamin Peterson29060642009-01-31 22:14:21 +00003101 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003102 Py_XDECREF(restuple);
3103 return res;
3104}
3105
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003106/* --- UTF-7 Codec -------------------------------------------------------- */
3107
Antoine Pitrou244651a2009-05-04 18:56:13 +00003108/* See RFC2152 for details. We encode conservatively and decode liberally. */
3109
3110/* Three simple macros defining base-64. */
3111
/* Is c one of the 64 characters of the base-64 alphabet
   (A-Z, a-z, 0-9, '+' and '/')?  c is evaluated several times. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))
3119
/* Map a base-64 character c to its 6-bit value:
   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62,
   anything else -> 63 (callers are expected to pass only base-64
   characters, so "anything else" is '/'). */

#define FROM_BASE64(c)                                      \
    (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)
3127
/* Return the base-64 character that encodes the bottom 6 bits of n;
   any higher bits of n are masked off. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
3132
/* DECODE_DIRECT: true if the byte c found in a UTF-7 string outside of
 * a base64 section decodes as itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3140
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  Each entry of the table holds one of:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
    /* 0x00-0x0F: nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10-0x1F: dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20-0x2F: sp ! " # $ % & ' ( ) * + , - . / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30-0x3F: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40-0x4F: @ A B C D E F G H I J K L M N O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: P Q R S T U V W X Y Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60-0x6F: ` a b c d e f g h i j k l m n o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: p q r s t u v w x y z { | } ~ del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
3174
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        : the character to test; only 1..127 can ever be direct
 *   directO  : non-zero if "Set O" characters are written directly
 *   directWS : non-zero if whitespace characters are written directly
 *
 * All three arguments are parenthesized in the expansion so that
 * arbitrary expressions (e.g. "a || b") can be passed safely.  c is
 * evaluated several times and must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003186
Alexander Belopolsky40018472011-02-26 01:02:56 +00003187PyObject *
3188PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003189 Py_ssize_t size,
3190 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003191{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003192 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3193}
3194
Antoine Pitrou244651a2009-05-04 18:56:13 +00003195/* The decoder. The only state we preserve is our read position,
3196 * i.e. how many characters we have consumed. So if we end in the
3197 * middle of a shift sequence we have to back off the read position
3198 * and the output to the beginning of the sequence, otherwise we lose
3199 * all the shift state (seen bits, number of bits seen, high
3200 * surrogate). */
3201
Alexander Belopolsky40018472011-02-26 01:02:56 +00003202PyObject *
3203PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003204 Py_ssize_t size,
3205 const char *errors,
3206 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003207{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003209 Py_ssize_t startinpos;
3210 Py_ssize_t endinpos;
3211 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003212 const char *e;
3213 PyUnicodeObject *unicode;
3214 Py_UNICODE *p;
3215 const char *errmsg = "";
3216 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003217 Py_UNICODE *shiftOutStart;
3218 unsigned int base64bits = 0;
3219 unsigned long base64buffer = 0;
3220 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 PyObject *errorHandler = NULL;
3222 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003223
3224 unicode = _PyUnicode_New(size);
3225 if (!unicode)
3226 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003227 if (size == 0) {
3228 if (consumed)
3229 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003230 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003231 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003233 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003234 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003235 e = s + size;
3236
3237 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003240 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003241
Antoine Pitrou244651a2009-05-04 18:56:13 +00003242 if (inShift) { /* in a base-64 section */
3243 if (IS_BASE64(ch)) { /* consume a base-64 character */
3244 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3245 base64bits += 6;
3246 s++;
3247 if (base64bits >= 16) {
3248 /* we have enough bits for a UTF-16 value */
3249 Py_UNICODE outCh = (Py_UNICODE)
3250 (base64buffer >> (base64bits-16));
3251 base64bits -= 16;
3252 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3253 if (surrogate) {
3254 /* expecting a second surrogate */
3255 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3256#ifdef Py_UNICODE_WIDE
3257 *p++ = (((surrogate & 0x3FF)<<10)
3258 | (outCh & 0x3FF)) + 0x10000;
3259#else
3260 *p++ = surrogate;
3261 *p++ = outCh;
3262#endif
3263 surrogate = 0;
3264 }
3265 else {
3266 surrogate = 0;
3267 errmsg = "second surrogate missing";
3268 goto utf7Error;
3269 }
3270 }
3271 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3272 /* first surrogate */
3273 surrogate = outCh;
3274 }
3275 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3276 errmsg = "unexpected second surrogate";
3277 goto utf7Error;
3278 }
3279 else {
3280 *p++ = outCh;
3281 }
3282 }
3283 }
3284 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003285 inShift = 0;
3286 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003287 if (surrogate) {
3288 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003289 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003290 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003291 if (base64bits > 0) { /* left-over bits */
3292 if (base64bits >= 6) {
3293 /* We've seen at least one base-64 character */
3294 errmsg = "partial character in shift sequence";
3295 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003296 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003297 else {
3298 /* Some bits remain; they should be zero */
3299 if (base64buffer != 0) {
3300 errmsg = "non-zero padding bits in shift sequence";
3301 goto utf7Error;
3302 }
3303 }
3304 }
3305 if (ch != '-') {
3306 /* '-' is absorbed; other terminating
3307 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003308 *p++ = ch;
3309 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003310 }
3311 }
3312 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003314 s++; /* consume '+' */
3315 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003316 s++;
3317 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003318 }
3319 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003320 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003321 shiftOutStart = p;
3322 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003323 }
3324 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003325 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003326 *p++ = ch;
3327 s++;
3328 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003329 else {
3330 startinpos = s-starts;
3331 s++;
3332 errmsg = "unexpected special character";
3333 goto utf7Error;
3334 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003335 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003336utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 outpos = p-PyUnicode_AS_UNICODE(unicode);
3338 endinpos = s-starts;
3339 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003340 errors, &errorHandler,
3341 "utf7", errmsg,
3342 &starts, &e, &startinpos, &endinpos, &exc, &s,
3343 &unicode, &outpos, &p))
3344 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003345 }
3346
Antoine Pitrou244651a2009-05-04 18:56:13 +00003347 /* end of string */
3348
3349 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3350 /* if we're in an inconsistent state, that's an error */
3351 if (surrogate ||
3352 (base64bits >= 6) ||
3353 (base64bits > 0 && base64buffer != 0)) {
3354 outpos = p-PyUnicode_AS_UNICODE(unicode);
3355 endinpos = size;
3356 if (unicode_decode_call_errorhandler(
3357 errors, &errorHandler,
3358 "utf7", "unterminated shift sequence",
3359 &starts, &e, &startinpos, &endinpos, &exc, &s,
3360 &unicode, &outpos, &p))
3361 goto onError;
3362 if (s < e)
3363 goto restart;
3364 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003365 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003366
3367 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003368 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003369 if (inShift) {
3370 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003371 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003372 }
3373 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003374 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003375 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003376 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003377
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003378 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003379 goto onError;
3380
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003381 Py_XDECREF(errorHandler);
3382 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003383 if (PyUnicode_READY(unicode) == -1) {
3384 Py_DECREF(unicode);
3385 return NULL;
3386 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003387 return (PyObject *)unicode;
3388
Benjamin Peterson29060642009-01-31 22:14:21 +00003389 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 Py_XDECREF(errorHandler);
3391 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003392 Py_DECREF(unicode);
3393 return NULL;
3394}
3395
3396
Alexander Belopolsky40018472011-02-26 01:02:56 +00003397PyObject *
3398PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003399 Py_ssize_t size,
3400 int base64SetO,
3401 int base64WhiteSpace,
3402 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003403{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003404 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003405 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003406 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003407 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003408 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003409 unsigned int base64bits = 0;
3410 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003411 char * out;
3412 char * start;
3413
3414 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003415 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003416
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003417 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003418 return PyErr_NoMemory();
3419
Antoine Pitrou244651a2009-05-04 18:56:13 +00003420 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003421 if (v == NULL)
3422 return NULL;
3423
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003424 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003425 for (;i < size; ++i) {
3426 Py_UNICODE ch = s[i];
3427
Antoine Pitrou244651a2009-05-04 18:56:13 +00003428 if (inShift) {
3429 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3430 /* shifting out */
3431 if (base64bits) { /* output remaining bits */
3432 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3433 base64buffer = 0;
3434 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003435 }
3436 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003437 /* Characters not in the BASE64 set implicitly unshift the sequence
3438 so no '-' is required, except if the character is itself a '-' */
3439 if (IS_BASE64(ch) || ch == '-') {
3440 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003441 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003442 *out++ = (char) ch;
3443 }
3444 else {
3445 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003446 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003447 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003448 else { /* not in a shift sequence */
3449 if (ch == '+') {
3450 *out++ = '+';
3451 *out++ = '-';
3452 }
3453 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3454 *out++ = (char) ch;
3455 }
3456 else {
3457 *out++ = '+';
3458 inShift = 1;
3459 goto encode_char;
3460 }
3461 }
3462 continue;
3463encode_char:
3464#ifdef Py_UNICODE_WIDE
3465 if (ch >= 0x10000) {
3466 /* code first surrogate */
3467 base64bits += 16;
3468 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3469 while (base64bits >= 6) {
3470 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3471 base64bits -= 6;
3472 }
3473 /* prepare second surrogate */
3474 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3475 }
3476#endif
3477 base64bits += 16;
3478 base64buffer = (base64buffer << 16) | ch;
3479 while (base64bits >= 6) {
3480 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3481 base64bits -= 6;
3482 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003483 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003484 if (base64bits)
3485 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3486 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003487 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003488 if (_PyBytes_Resize(&v, out - start) < 0)
3489 return NULL;
3490 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003491}
3492
Antoine Pitrou244651a2009-05-04 18:56:13 +00003493#undef IS_BASE64
3494#undef FROM_BASE64
3495#undef TO_BASE64
3496#undef DECODE_DIRECT
3497#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003498
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499/* --- UTF-8 Codec -------------------------------------------------------- */
3500
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence
   it starts.  Zero means illegal prefix.  See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3522
Alexander Belopolsky40018472011-02-26 01:02:56 +00003523PyObject *
3524PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003525 Py_ssize_t size,
3526 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527{
Walter Dörwald69652032004-09-07 20:24:22 +00003528 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3529}
3530
Antoine Pitrouab868312009-01-10 15:40:25 +00003531/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3532#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3533
3534/* Mask to quickly check whether a C 'long' contains a
3535 non-ASCII, UTF8-encoded char. */
3536#if (SIZEOF_LONG == 8)
3537# define ASCII_CHAR_MASK 0x8080808080808080L
3538#elif (SIZEOF_LONG == 4)
3539# define ASCII_CHAR_MASK 0x80808080L
3540#else
3541# error C 'long' size should be either 4 or 8!
3542#endif
3543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003544/* Scans a UTF-8 string and returns the maximum character to be expected,
3545 the size of the decoded unicode string and if any major errors were
3546 encountered.
3547
3548 This function does check basic UTF-8 sanity, it does however NOT CHECK
3549 if the string contains surrogates, and if all continuation bytes are
3550 within the correct ranges, these checks are performed in
3551 PyUnicode_DecodeUTF8Stateful.
3552
3553 If it sets has_errors to 1, it means the value of unicode_size and max_char
3554 will be bogus and you should not rely on useful information in them.
3555 */
3556static Py_UCS4
3557utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3558 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3559 int *has_errors)
3560{
3561 Py_ssize_t n;
3562 Py_ssize_t char_count = 0;
3563 Py_UCS4 max_char = 127, new_max;
3564 Py_UCS4 upper_bound;
3565 const unsigned char *p = (const unsigned char *)s;
3566 const unsigned char *end = p + string_size;
3567 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3568 int err = 0;
3569
3570 for (; p < end && !err; ++p, ++char_count) {
3571 /* Only check value if it's not a ASCII char... */
3572 if (*p < 0x80) {
3573 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3574 an explanation. */
3575 if (!((size_t) p & LONG_PTR_MASK)) {
3576 /* Help register allocation */
3577 register const unsigned char *_p = p;
3578 while (_p < aligned_end) {
3579 unsigned long value = *(unsigned long *) _p;
3580 if (value & ASCII_CHAR_MASK)
3581 break;
3582 _p += SIZEOF_LONG;
3583 char_count += SIZEOF_LONG;
3584 }
3585 p = _p;
3586 if (p == end)
3587 break;
3588 }
3589 }
3590 if (*p >= 0x80) {
3591 n = utf8_code_length[*p];
3592 new_max = max_char;
3593 switch (n) {
3594 /* invalid start byte */
3595 case 0:
3596 err = 1;
3597 break;
3598 case 2:
3599 /* Code points between 0x00FF and 0x07FF inclusive.
3600 Approximate the upper bound of the code point,
3601 if this flips over 255 we can be sure it will be more
3602 than 255 and the string will need 2 bytes per code coint,
3603 if it stays under or equal to 255, we can be sure 1 byte
3604 is enough.
3605 ((*p & 0b00011111) << 6) | 0b00111111 */
3606 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3607 if (max_char < upper_bound)
3608 new_max = upper_bound;
3609 /* Ensure we track at least that we left ASCII space. */
3610 if (new_max < 128)
3611 new_max = 128;
3612 break;
3613 case 3:
3614 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3615 always > 255 and <= 65535 and will always need 2 bytes. */
3616 if (max_char < 65535)
3617 new_max = 65535;
3618 break;
3619 case 4:
3620 /* Code point will be above 0xFFFF for sure in this case. */
3621 new_max = 65537;
3622 break;
3623 /* Internal error, this should be caught by the first if */
3624 case 1:
3625 default:
3626 assert(0 && "Impossible case in utf8_max_char_and_size");
3627 err = 1;
3628 }
3629 /* Instead of number of overall bytes for this code point,
3630 n containts the number of following bytes: */
3631 --n;
3632 /* Check if the follow up chars are all valid continuation bytes */
3633 if (n >= 1) {
3634 const unsigned char *cont;
3635 if ((p + n) >= end) {
3636 if (consumed == 0)
3637 /* incomplete data, non-incremental decoding */
3638 err = 1;
3639 break;
3640 }
3641 for (cont = p + 1; cont < (p + n); ++cont) {
3642 if ((*cont & 0xc0) != 0x80) {
3643 err = 1;
3644 break;
3645 }
3646 }
3647 p += n;
3648 }
3649 else
3650 err = 1;
3651 max_char = new_max;
3652 }
3653 }
3654
3655 if (unicode_size)
3656 *unicode_size = char_count;
3657 if (has_errors)
3658 *has_errors = err;
3659 return max_char;
3660}
3661
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   Stores value at position index of buf, using the element type that
   corresponds to kind: Py_UNICODE for PyUnicode_WCHAR_KIND, unsigned char
   for PyUnicode_1BYTE_KIND, Py_UCS2 for PyUnicode_2BYTE_KIND and Py_UCS4
   for every other kind.  kind is evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        switch ((int)(kind)) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
3676
Alexander Belopolsky40018472011-02-26 01:02:56 +00003677PyObject *
3678PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003679 Py_ssize_t size,
3680 const char *errors,
3681 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003682{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003685 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003686 Py_ssize_t startinpos;
3687 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003688 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003690 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 PyObject *errorHandler = NULL;
3692 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003693 Py_UCS4 maxchar = 0;
3694 Py_ssize_t unicode_size;
3695 Py_ssize_t i;
3696 int kind;
3697 void *data;
3698 int has_errors;
3699 Py_UNICODE *error_outptr;
3700#if SIZEOF_WCHAR_T == 2
3701 Py_ssize_t wchar_offset = 0;
3702#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703
Walter Dörwald69652032004-09-07 20:24:22 +00003704 if (size == 0) {
3705 if (consumed)
3706 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003707 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003708 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3710 consumed, &has_errors);
3711 if (has_errors) {
3712 unicode = _PyUnicode_New(size);
3713 if (!unicode)
3714 return NULL;
3715 kind = PyUnicode_WCHAR_KIND;
3716 data = PyUnicode_AS_UNICODE(unicode);
3717 assert(data != NULL);
3718 }
3719 else {
3720 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3721 if (!unicode)
3722 return NULL;
3723 /* When the string is ASCII only, just use memcpy and return.
3724 unicode_size may be != size if there is an incomplete UTF-8
3725 sequence at the end of the ASCII block. */
3726 if (maxchar < 128 && size == unicode_size) {
3727 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3728 return (PyObject *)unicode;
3729 }
3730 kind = PyUnicode_KIND(unicode);
3731 data = PyUnicode_DATA(unicode);
3732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003736 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737
3738 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003739 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740
3741 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003742 /* Fast path for runs of ASCII characters. Given that common UTF-8
3743 input will consist of an overwhelming majority of ASCII
3744 characters, we try to optimize for this case by checking
3745 as many characters as a C 'long' can contain.
3746 First, check if we can do an aligned read, as most CPUs have
3747 a penalty for unaligned reads.
3748 */
3749 if (!((size_t) s & LONG_PTR_MASK)) {
3750 /* Help register allocation */
3751 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003753 while (_s < aligned_end) {
3754 /* Read a whole long at a time (either 4 or 8 bytes),
3755 and do a fast unrolled copy if it only contains ASCII
3756 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003757 unsigned long value = *(unsigned long *) _s;
3758 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003759 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003760 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3761 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3762 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3763 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003764#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3766 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3767 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3768 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003769#endif
3770 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003772 }
3773 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003774 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003775 if (s == e)
3776 break;
3777 ch = (unsigned char)*s;
3778 }
3779 }
3780
3781 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003782 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 s++;
3784 continue;
3785 }
3786
3787 n = utf8_code_length[ch];
3788
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003789 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003790 if (consumed)
3791 break;
3792 else {
3793 errmsg = "unexpected end of data";
3794 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003795 endinpos = startinpos+1;
3796 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3797 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003798 goto utf8Error;
3799 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801
3802 switch (n) {
3803
3804 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003805 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003806 startinpos = s-starts;
3807 endinpos = startinpos+1;
3808 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809
3810 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003811 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003812 startinpos = s-starts;
3813 endinpos = startinpos+1;
3814 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815
3816 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003817 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003818 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003819 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003820 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003821 goto utf8Error;
3822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003824 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003825 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 break;
3827
3828 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003829 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3830 will result in surrogates in range d800-dfff. Surrogates are
3831 not valid UTF-8 so they are rejected.
3832 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3833 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003834 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003835 (s[2] & 0xc0) != 0x80 ||
3836 ((unsigned char)s[0] == 0xE0 &&
3837 (unsigned char)s[1] < 0xA0) ||
3838 ((unsigned char)s[0] == 0xED &&
3839 (unsigned char)s[1] > 0x9F)) {
3840 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003841 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003842 endinpos = startinpos + 1;
3843
3844 /* if s[1] first two bits are 1 and 0, then the invalid
3845 continuation byte is s[2], so increment endinpos by 1,
3846 if not, s[1] is invalid and endinpos doesn't need to
3847 be incremented. */
3848 if ((s[1] & 0xC0) == 0x80)
3849 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003850 goto utf8Error;
3851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003853 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003855 break;
3856
3857 case 4:
3858 if ((s[1] & 0xc0) != 0x80 ||
3859 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003860 (s[3] & 0xc0) != 0x80 ||
3861 ((unsigned char)s[0] == 0xF0 &&
3862 (unsigned char)s[1] < 0x90) ||
3863 ((unsigned char)s[0] == 0xF4 &&
3864 (unsigned char)s[1] > 0x8F)) {
3865 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003866 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003867 endinpos = startinpos + 1;
3868 if ((s[1] & 0xC0) == 0x80) {
3869 endinpos++;
3870 if ((s[2] & 0xC0) == 0x80)
3871 endinpos++;
3872 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003873 goto utf8Error;
3874 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003875 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003876 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3877 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 /* If the string is flexible or we have native UCS-4, write
3880 directly.. */
3881 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3882 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884 else {
3885 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887 /* translate from 10000..10FFFF to 0..FFFF */
3888 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 /* high surrogate = top 10 bits added to D800 */
3891 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3892 (Py_UNICODE)(0xD800 + (ch >> 10)));
3893
3894 /* low surrogate = bottom 10 bits added to DC00 */
3895 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3896 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3897 }
3898#if SIZEOF_WCHAR_T == 2
3899 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003900#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 }
3903 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003904 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003905
Benjamin Peterson29060642009-01-31 22:14:21 +00003906 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 /* If this is not yet a resizable string, make it one.. */
3908 if (kind != PyUnicode_WCHAR_KIND) {
3909 const Py_UNICODE *u;
3910 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3911 if (!new_unicode)
3912 goto onError;
3913 u = PyUnicode_AsUnicode((PyObject *)unicode);
3914 if (!u)
3915 goto onError;
3916#if SIZEOF_WCHAR_T == 2
3917 i += wchar_offset;
3918#endif
3919 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3920 Py_DECREF(unicode);
3921 unicode = new_unicode;
3922 kind = 0;
3923 data = PyUnicode_AS_UNICODE(new_unicode);
3924 assert(data != NULL);
3925 }
3926 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003927 if (unicode_decode_call_errorhandler(
3928 errors, &errorHandler,
3929 "utf8", errmsg,
3930 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003932 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 /* Update data because unicode_decode_call_errorhandler might have
3934 re-created or resized the unicode object. */
3935 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003936 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003938 /* Ensure the unicode_size calculation above was correct: */
3939 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3940
Walter Dörwald69652032004-09-07 20:24:22 +00003941 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003942 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944 /* Adjust length and ready string when it contained errors and
3945 is of the old resizable kind. */
3946 if (kind == PyUnicode_WCHAR_KIND) {
3947 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3948 PyUnicode_READY(unicode) == -1)
3949 goto onError;
3950 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 Py_XDECREF(errorHandler);
3953 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954 if (PyUnicode_READY(unicode) == -1) {
3955 Py_DECREF(unicode);
3956 return NULL;
3957 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 return (PyObject *)unicode;
3959
Benjamin Peterson29060642009-01-31 22:14:21 +00003960 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 Py_XDECREF(errorHandler);
3962 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 Py_DECREF(unicode);
3964 return NULL;
3965}
3966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003967#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003968
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003969#ifdef __APPLE__
3970
3971/* Simplified UTF-8 decoder using surrogateescape error handler,
3972 used to decode the command line arguments on Mac OS X. */
3973
3974wchar_t*
3975_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3976{
3977 int n;
3978 const char *e;
3979 wchar_t *unicode, *p;
3980
3981 /* Note: size will always be longer than the resulting Unicode
3982 character count */
3983 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3984 PyErr_NoMemory();
3985 return NULL;
3986 }
3987 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3988 if (!unicode)
3989 return NULL;
3990
3991 /* Unpack UTF-8 encoded data */
3992 p = unicode;
3993 e = s + size;
3994 while (s < e) {
3995 Py_UCS4 ch = (unsigned char)*s;
3996
3997 if (ch < 0x80) {
3998 *p++ = (wchar_t)ch;
3999 s++;
4000 continue;
4001 }
4002
4003 n = utf8_code_length[ch];
4004 if (s + n > e) {
4005 goto surrogateescape;
4006 }
4007
4008 switch (n) {
4009 case 0:
4010 case 1:
4011 goto surrogateescape;
4012
4013 case 2:
4014 if ((s[1] & 0xc0) != 0x80)
4015 goto surrogateescape;
4016 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4017 assert ((ch > 0x007F) && (ch <= 0x07FF));
4018 *p++ = (wchar_t)ch;
4019 break;
4020
4021 case 3:
4022 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4023 will result in surrogates in range d800-dfff. Surrogates are
4024 not valid UTF-8 so they are rejected.
4025 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4026 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4027 if ((s[1] & 0xc0) != 0x80 ||
4028 (s[2] & 0xc0) != 0x80 ||
4029 ((unsigned char)s[0] == 0xE0 &&
4030 (unsigned char)s[1] < 0xA0) ||
4031 ((unsigned char)s[0] == 0xED &&
4032 (unsigned char)s[1] > 0x9F)) {
4033
4034 goto surrogateescape;
4035 }
4036 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4037 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004039 break;
4040
4041 case 4:
4042 if ((s[1] & 0xc0) != 0x80 ||
4043 (s[2] & 0xc0) != 0x80 ||
4044 (s[3] & 0xc0) != 0x80 ||
4045 ((unsigned char)s[0] == 0xF0 &&
4046 (unsigned char)s[1] < 0x90) ||
4047 ((unsigned char)s[0] == 0xF4 &&
4048 (unsigned char)s[1] > 0x8F)) {
4049 goto surrogateescape;
4050 }
4051 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4052 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4053 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4054
4055#if SIZEOF_WCHAR_T == 4
4056 *p++ = (wchar_t)ch;
4057#else
4058 /* compute and append the two surrogates: */
4059
4060 /* translate from 10000..10FFFF to 0..FFFF */
4061 ch -= 0x10000;
4062
4063 /* high surrogate = top 10 bits added to D800 */
4064 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4065
4066 /* low surrogate = bottom 10 bits added to DC00 */
4067 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4068#endif
4069 break;
4070 }
4071 s += n;
4072 continue;
4073
4074 surrogateescape:
4075 *p++ = 0xDC00 + ch;
4076 s++;
4077 }
4078 *p = L'\0';
4079 return unicode;
4080}
4081
4082#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004084/* Primary internal function which creates utf8 encoded bytes objects.
4085
4086 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004087 and allocate exactly as much space needed at the end. Else allocate the
4088 maximum possible needed (4 result bytes per Unicode character), and return
4089 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004090*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004091PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093{
Tim Peters602f7402002-04-27 18:03:26 +00004094#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004095
Guido van Rossum98297ee2007-11-06 21:34:58 +00004096 Py_ssize_t i; /* index into s of next input byte */
4097 PyObject *result; /* result string object */
4098 char *p; /* next free byte in output buffer */
4099 Py_ssize_t nallocated; /* number of result bytes allocated */
4100 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004101 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004102 PyObject *errorHandler = NULL;
4103 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004104 int kind;
4105 void *data;
4106 Py_ssize_t size;
4107 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4108#if SIZEOF_WCHAR_T == 2
4109 Py_ssize_t wchar_offset = 0;
4110#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004112 if (!PyUnicode_Check(unicode)) {
4113 PyErr_BadArgument();
4114 return NULL;
4115 }
4116
4117 if (PyUnicode_READY(unicode) == -1)
4118 return NULL;
4119
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004120 if (PyUnicode_UTF8(unicode))
4121 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4122 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004123
4124 kind = PyUnicode_KIND(unicode);
4125 data = PyUnicode_DATA(unicode);
4126 size = PyUnicode_GET_LENGTH(unicode);
4127
Tim Peters602f7402002-04-27 18:03:26 +00004128 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129
Tim Peters602f7402002-04-27 18:03:26 +00004130 if (size <= MAX_SHORT_UNICHARS) {
4131 /* Write into the stack buffer; nallocated can't overflow.
4132 * At the end, we'll allocate exactly as much heap space as it
4133 * turns out we need.
4134 */
4135 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004136 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004137 p = stackbuf;
4138 }
4139 else {
4140 /* Overallocate on the heap, and give the excess back at the end. */
4141 nallocated = size * 4;
4142 if (nallocated / 4 != size) /* overflow! */
4143 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004144 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004145 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004146 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004147 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004148 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004149
Tim Peters602f7402002-04-27 18:03:26 +00004150 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004151 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004152
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004153 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004154 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004156
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004158 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004159 *p++ = (char)(0xc0 | (ch >> 6));
4160 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004161 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004162 Py_ssize_t newpos;
4163 PyObject *rep;
4164 Py_ssize_t repsize, k, startpos;
4165 startpos = i-1;
4166#if SIZEOF_WCHAR_T == 2
4167 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004168#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004169 rep = unicode_encode_call_errorhandler(
4170 errors, &errorHandler, "utf-8", "surrogates not allowed",
4171 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4172 &exc, startpos, startpos+1, &newpos);
4173 if (!rep)
4174 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004176 if (PyBytes_Check(rep))
4177 repsize = PyBytes_GET_SIZE(rep);
4178 else
4179 repsize = PyUnicode_GET_SIZE(rep);
4180
4181 if (repsize > 4) {
4182 Py_ssize_t offset;
4183
4184 if (result == NULL)
4185 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004186 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004187 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004189 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4190 /* integer overflow */
4191 PyErr_NoMemory();
4192 goto error;
4193 }
4194 nallocated += repsize - 4;
4195 if (result != NULL) {
4196 if (_PyBytes_Resize(&result, nallocated) < 0)
4197 goto error;
4198 } else {
4199 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004200 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004201 goto error;
4202 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4203 }
4204 p = PyBytes_AS_STRING(result) + offset;
4205 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004207 if (PyBytes_Check(rep)) {
4208 char *prep = PyBytes_AS_STRING(rep);
4209 for(k = repsize; k > 0; k--)
4210 *p++ = *prep++;
4211 } else /* rep is unicode */ {
4212 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4213 Py_UNICODE c;
4214
4215 for(k=0; k<repsize; k++) {
4216 c = prep[k];
4217 if (0x80 <= c) {
4218 raise_encode_exception(&exc, "utf-8",
4219 PyUnicode_AS_UNICODE(unicode),
4220 size, i-1, i,
4221 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004222 goto error;
4223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004224 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004225 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004226 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004227 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004228 } else if (ch < 0x10000) {
4229 *p++ = (char)(0xe0 | (ch >> 12));
4230 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4231 *p++ = (char)(0x80 | (ch & 0x3f));
4232 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004233 /* Encode UCS4 Unicode ordinals */
4234 *p++ = (char)(0xf0 | (ch >> 18));
4235 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4236 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4237 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238#if SIZEOF_WCHAR_T == 2
4239 wchar_offset++;
4240#endif
Tim Peters602f7402002-04-27 18:03:26 +00004241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004242 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004243
Guido van Rossum98297ee2007-11-06 21:34:58 +00004244 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004245 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004246 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004247 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004248 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004249 }
4250 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004251 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004252 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004253 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004254 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004256
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004257 Py_XDECREF(errorHandler);
4258 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004259 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004260 error:
4261 Py_XDECREF(errorHandler);
4262 Py_XDECREF(exc);
4263 Py_XDECREF(result);
4264 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004265
Tim Peters602f7402002-04-27 18:03:26 +00004266#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267}
4268
Alexander Belopolsky40018472011-02-26 01:02:56 +00004269PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004270PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4271 Py_ssize_t size,
4272 const char *errors)
4273{
4274 PyObject *v, *unicode;
4275
4276 unicode = PyUnicode_FromUnicode(s, size);
4277 if (unicode == NULL)
4278 return NULL;
4279 v = _PyUnicode_AsUTF8String(unicode, errors);
4280 Py_DECREF(unicode);
4281 return v;
4282}
4283
4284PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004285PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004287 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288}
4289
Walter Dörwald41980ca2007-08-16 21:55:45 +00004290/* --- UTF-32 Codec ------------------------------------------------------- */
4291
4292PyObject *
4293PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 Py_ssize_t size,
4295 const char *errors,
4296 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004297{
4298 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4299}
4300
4301PyObject *
4302PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004303 Py_ssize_t size,
4304 const char *errors,
4305 int *byteorder,
4306 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004307{
4308 const char *starts = s;
4309 Py_ssize_t startinpos;
4310 Py_ssize_t endinpos;
4311 Py_ssize_t outpos;
4312 PyUnicodeObject *unicode;
4313 Py_UNICODE *p;
4314#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004315 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004316 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004317#else
4318 const int pairs = 0;
4319#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004320 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004321 int bo = 0; /* assume native ordering by default */
4322 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004323 /* Offsets from q for retrieving bytes in the right order. */
4324#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4325 int iorder[] = {0, 1, 2, 3};
4326#else
4327 int iorder[] = {3, 2, 1, 0};
4328#endif
4329 PyObject *errorHandler = NULL;
4330 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004331
Walter Dörwald41980ca2007-08-16 21:55:45 +00004332 q = (unsigned char *)s;
4333 e = q + size;
4334
4335 if (byteorder)
4336 bo = *byteorder;
4337
4338 /* Check for BOM marks (U+FEFF) in the input and adjust current
4339 byte order setting accordingly. In native mode, the leading BOM
4340 mark is skipped, in all other modes, it is copied to the output
4341 stream as-is (giving a ZWNBSP character). */
4342 if (bo == 0) {
4343 if (size >= 4) {
4344 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004345 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004346#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004347 if (bom == 0x0000FEFF) {
4348 q += 4;
4349 bo = -1;
4350 }
4351 else if (bom == 0xFFFE0000) {
4352 q += 4;
4353 bo = 1;
4354 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004355#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004356 if (bom == 0x0000FEFF) {
4357 q += 4;
4358 bo = 1;
4359 }
4360 else if (bom == 0xFFFE0000) {
4361 q += 4;
4362 bo = -1;
4363 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004364#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004366 }
4367
4368 if (bo == -1) {
4369 /* force LE */
4370 iorder[0] = 0;
4371 iorder[1] = 1;
4372 iorder[2] = 2;
4373 iorder[3] = 3;
4374 }
4375 else if (bo == 1) {
4376 /* force BE */
4377 iorder[0] = 3;
4378 iorder[1] = 2;
4379 iorder[2] = 1;
4380 iorder[3] = 0;
4381 }
4382
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004383 /* On narrow builds we split characters outside the BMP into two
4384 codepoints => count how much extra space we need. */
4385#ifndef Py_UNICODE_WIDE
4386 for (qq = q; qq < e; qq += 4)
4387 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4388 pairs++;
4389#endif
4390
4391 /* This might be one to much, because of a BOM */
4392 unicode = _PyUnicode_New((size+3)/4+pairs);
4393 if (!unicode)
4394 return NULL;
4395 if (size == 0)
4396 return (PyObject *)unicode;
4397
4398 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004399 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004400
Walter Dörwald41980ca2007-08-16 21:55:45 +00004401 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004402 Py_UCS4 ch;
4403 /* remaining bytes at the end? (size should be divisible by 4) */
4404 if (e-q<4) {
4405 if (consumed)
4406 break;
4407 errmsg = "truncated data";
4408 startinpos = ((const char *)q)-starts;
4409 endinpos = ((const char *)e)-starts;
4410 goto utf32Error;
4411 /* The remaining input chars are ignored if the callback
4412 chooses to skip the input */
4413 }
4414 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4415 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004416
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 if (ch >= 0x110000)
4418 {
4419 errmsg = "codepoint not in range(0x110000)";
4420 startinpos = ((const char *)q)-starts;
4421 endinpos = startinpos+4;
4422 goto utf32Error;
4423 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004424#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 if (ch >= 0x10000)
4426 {
4427 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4428 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4429 }
4430 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004431#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004432 *p++ = ch;
4433 q += 4;
4434 continue;
4435 utf32Error:
4436 outpos = p-PyUnicode_AS_UNICODE(unicode);
4437 if (unicode_decode_call_errorhandler(
4438 errors, &errorHandler,
4439 "utf32", errmsg,
4440 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4441 &unicode, &outpos, &p))
4442 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004443 }
4444
4445 if (byteorder)
4446 *byteorder = bo;
4447
4448 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004450
4451 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004452 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004453 goto onError;
4454
4455 Py_XDECREF(errorHandler);
4456 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004457 if (PyUnicode_READY(unicode) == -1) {
4458 Py_DECREF(unicode);
4459 return NULL;
4460 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004461 return (PyObject *)unicode;
4462
Benjamin Peterson29060642009-01-31 22:14:21 +00004463 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004464 Py_DECREF(unicode);
4465 Py_XDECREF(errorHandler);
4466 Py_XDECREF(exc);
4467 return NULL;
4468}
4469
4470PyObject *
4471PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 Py_ssize_t size,
4473 const char *errors,
4474 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004475{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004476 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004477 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004478 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004479#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004480 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004481#else
4482 const int pairs = 0;
4483#endif
4484 /* Offsets from p for storing byte pairs in the right order. */
4485#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4486 int iorder[] = {0, 1, 2, 3};
4487#else
4488 int iorder[] = {3, 2, 1, 0};
4489#endif
4490
Benjamin Peterson29060642009-01-31 22:14:21 +00004491#define STORECHAR(CH) \
4492 do { \
4493 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4494 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4495 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4496 p[iorder[0]] = (CH) & 0xff; \
4497 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004498 } while(0)
4499
4500 /* In narrow builds we can output surrogate pairs as one codepoint,
4501 so we need less space. */
4502#ifndef Py_UNICODE_WIDE
4503 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4505 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4506 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004507#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004508 nsize = (size - pairs + (byteorder == 0));
4509 bytesize = nsize * 4;
4510 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004512 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004513 if (v == NULL)
4514 return NULL;
4515
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004516 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004517 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004519 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004520 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004521
4522 if (byteorder == -1) {
4523 /* force LE */
4524 iorder[0] = 0;
4525 iorder[1] = 1;
4526 iorder[2] = 2;
4527 iorder[3] = 3;
4528 }
4529 else if (byteorder == 1) {
4530 /* force BE */
4531 iorder[0] = 3;
4532 iorder[1] = 2;
4533 iorder[2] = 1;
4534 iorder[3] = 0;
4535 }
4536
4537 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004538 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004539#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4541 Py_UCS4 ch2 = *s;
4542 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4543 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4544 s++;
4545 size--;
4546 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004547 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004548#endif
4549 STORECHAR(ch);
4550 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004551
4552 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004553 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004554#undef STORECHAR
4555}
4556
Alexander Belopolsky40018472011-02-26 01:02:56 +00004557PyObject *
4558PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004559{
4560 if (!PyUnicode_Check(unicode)) {
4561 PyErr_BadArgument();
4562 return NULL;
4563 }
4564 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004565 PyUnicode_GET_SIZE(unicode),
4566 NULL,
4567 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004568}
4569
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570/* --- UTF-16 Codec ------------------------------------------------------- */
4571
Tim Peters772747b2001-08-09 22:21:55 +00004572PyObject *
4573PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 Py_ssize_t size,
4575 const char *errors,
4576 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577{
Walter Dörwald69652032004-09-07 20:24:22 +00004578 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4579}
4580
Antoine Pitrouab868312009-01-10 15:40:25 +00004581/* Two masks for fast checking of whether a C 'long' may contain
4582 UTF16-encoded surrogate characters. This is an efficient heuristic,
4583 assuming that non-surrogate characters with a code point >= 0x8000 are
4584 rare in most input.
4585 FAST_CHAR_MASK is used when the input is in native byte ordering,
4586 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004587*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004588#if (SIZEOF_LONG == 8)
4589# define FAST_CHAR_MASK 0x8000800080008000L
4590# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4591#elif (SIZEOF_LONG == 4)
4592# define FAST_CHAR_MASK 0x80008000L
4593# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4594#else
4595# error C 'long' size should be either 4 or 8!
4596#endif
4597
Walter Dörwald69652032004-09-07 20:24:22 +00004598PyObject *
4599PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004600 Py_ssize_t size,
4601 const char *errors,
4602 int *byteorder,
4603 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004604{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004606 Py_ssize_t startinpos;
4607 Py_ssize_t endinpos;
4608 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609 PyUnicodeObject *unicode;
4610 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004611 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004612 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004613 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004614 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004615 /* Offsets from q for retrieving byte pairs in the right order. */
4616#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4617 int ihi = 1, ilo = 0;
4618#else
4619 int ihi = 0, ilo = 1;
4620#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 PyObject *errorHandler = NULL;
4622 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623
4624 /* Note: size will always be longer than the resulting Unicode
4625 character count */
4626 unicode = _PyUnicode_New(size);
4627 if (!unicode)
4628 return NULL;
4629 if (size == 0)
4630 return (PyObject *)unicode;
4631
4632 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004633 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004634 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004635 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636
4637 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004638 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004639
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004640 /* Check for BOM marks (U+FEFF) in the input and adjust current
4641 byte order setting accordingly. In native mode, the leading BOM
4642 mark is skipped, in all other modes, it is copied to the output
4643 stream as-is (giving a ZWNBSP character). */
4644 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004645 if (size >= 2) {
4646 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004647#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 if (bom == 0xFEFF) {
4649 q += 2;
4650 bo = -1;
4651 }
4652 else if (bom == 0xFFFE) {
4653 q += 2;
4654 bo = 1;
4655 }
Tim Petersced69f82003-09-16 20:30:58 +00004656#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 if (bom == 0xFEFF) {
4658 q += 2;
4659 bo = 1;
4660 }
4661 else if (bom == 0xFFFE) {
4662 q += 2;
4663 bo = -1;
4664 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004665#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004666 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668
Tim Peters772747b2001-08-09 22:21:55 +00004669 if (bo == -1) {
4670 /* force LE */
4671 ihi = 1;
4672 ilo = 0;
4673 }
4674 else if (bo == 1) {
4675 /* force BE */
4676 ihi = 0;
4677 ilo = 1;
4678 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004679#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4680 native_ordering = ilo < ihi;
4681#else
4682 native_ordering = ilo > ihi;
4683#endif
Tim Peters772747b2001-08-09 22:21:55 +00004684
Antoine Pitrouab868312009-01-10 15:40:25 +00004685 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004686 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004687 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004688 /* First check for possible aligned read of a C 'long'. Unaligned
4689 reads are more expensive, better to defer to another iteration. */
4690 if (!((size_t) q & LONG_PTR_MASK)) {
4691 /* Fast path for runs of non-surrogate chars. */
4692 register const unsigned char *_q = q;
4693 Py_UNICODE *_p = p;
4694 if (native_ordering) {
4695 /* Native ordering is simple: as long as the input cannot
4696 possibly contain a surrogate char, do an unrolled copy
4697 of several 16-bit code points to the target object.
4698 The non-surrogate check is done on several input bytes
4699 at a time (as many as a C 'long' can contain). */
4700 while (_q < aligned_end) {
4701 unsigned long data = * (unsigned long *) _q;
4702 if (data & FAST_CHAR_MASK)
4703 break;
4704 _p[0] = ((unsigned short *) _q)[0];
4705 _p[1] = ((unsigned short *) _q)[1];
4706#if (SIZEOF_LONG == 8)
4707 _p[2] = ((unsigned short *) _q)[2];
4708 _p[3] = ((unsigned short *) _q)[3];
4709#endif
4710 _q += SIZEOF_LONG;
4711 _p += SIZEOF_LONG / 2;
4712 }
4713 }
4714 else {
4715 /* Byteswapped ordering is similar, but we must decompose
4716 the copy bytewise, and take care of zero'ing out the
4717 upper bytes if the target object is in 32-bit units
4718 (that is, in UCS-4 builds). */
4719 while (_q < aligned_end) {
4720 unsigned long data = * (unsigned long *) _q;
4721 if (data & SWAPPED_FAST_CHAR_MASK)
4722 break;
4723 /* Zero upper bytes in UCS-4 builds */
4724#if (Py_UNICODE_SIZE > 2)
4725 _p[0] = 0;
4726 _p[1] = 0;
4727#if (SIZEOF_LONG == 8)
4728 _p[2] = 0;
4729 _p[3] = 0;
4730#endif
4731#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004732 /* Issue #4916; UCS-4 builds on big endian machines must
4733 fill the two last bytes of each 4-byte unit. */
4734#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4735# define OFF 2
4736#else
4737# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004738#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004739 ((unsigned char *) _p)[OFF + 1] = _q[0];
4740 ((unsigned char *) _p)[OFF + 0] = _q[1];
4741 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4742 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4743#if (SIZEOF_LONG == 8)
4744 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4745 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4746 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4747 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4748#endif
4749#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004750 _q += SIZEOF_LONG;
4751 _p += SIZEOF_LONG / 2;
4752 }
4753 }
4754 p = _p;
4755 q = _q;
4756 if (q >= e)
4757 break;
4758 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004759 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004760
Benjamin Peterson14339b62009-01-31 16:36:08 +00004761 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004762
4763 if (ch < 0xD800 || ch > 0xDFFF) {
4764 *p++ = ch;
4765 continue;
4766 }
4767
4768 /* UTF-16 code pair: */
4769 if (q > e) {
4770 errmsg = "unexpected end of data";
4771 startinpos = (((const char *)q) - 2) - starts;
4772 endinpos = ((const char *)e) + 1 - starts;
4773 goto utf16Error;
4774 }
4775 if (0xD800 <= ch && ch <= 0xDBFF) {
4776 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4777 q += 2;
4778 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004779#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 *p++ = ch;
4781 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004782#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004783 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004784#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004785 continue;
4786 }
4787 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004788 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004789 startinpos = (((const char *)q)-4)-starts;
4790 endinpos = startinpos+2;
4791 goto utf16Error;
4792 }
4793
Benjamin Peterson14339b62009-01-31 16:36:08 +00004794 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004795 errmsg = "illegal encoding";
4796 startinpos = (((const char *)q)-2)-starts;
4797 endinpos = startinpos+2;
4798 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004799
Benjamin Peterson29060642009-01-31 22:14:21 +00004800 utf16Error:
4801 outpos = p - PyUnicode_AS_UNICODE(unicode);
4802 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004803 errors,
4804 &errorHandler,
4805 "utf16", errmsg,
4806 &starts,
4807 (const char **)&e,
4808 &startinpos,
4809 &endinpos,
4810 &exc,
4811 (const char **)&q,
4812 &unicode,
4813 &outpos,
4814 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004815 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004817 /* remaining byte at the end? (size should be even) */
4818 if (e == q) {
4819 if (!consumed) {
4820 errmsg = "truncated data";
4821 startinpos = ((const char *)q) - starts;
4822 endinpos = ((const char *)e) + 1 - starts;
4823 outpos = p - PyUnicode_AS_UNICODE(unicode);
4824 if (unicode_decode_call_errorhandler(
4825 errors,
4826 &errorHandler,
4827 "utf16", errmsg,
4828 &starts,
4829 (const char **)&e,
4830 &startinpos,
4831 &endinpos,
4832 &exc,
4833 (const char **)&q,
4834 &unicode,
4835 &outpos,
4836 &p))
4837 goto onError;
4838 /* The remaining input chars are ignored if the callback
4839 chooses to skip the input */
4840 }
4841 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842
4843 if (byteorder)
4844 *byteorder = bo;
4845
Walter Dörwald69652032004-09-07 20:24:22 +00004846 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004847 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004848
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004850 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 goto onError;
4852
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853 Py_XDECREF(errorHandler);
4854 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004855 if (PyUnicode_READY(unicode) == -1) {
4856 Py_DECREF(unicode);
4857 return NULL;
4858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 return (PyObject *)unicode;
4860
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004863 Py_XDECREF(errorHandler);
4864 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 return NULL;
4866}
4867
Antoine Pitrouab868312009-01-10 15:40:25 +00004868#undef FAST_CHAR_MASK
4869#undef SWAPPED_FAST_CHAR_MASK
4870
Tim Peters772747b2001-08-09 22:21:55 +00004871PyObject *
4872PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004873 Py_ssize_t size,
4874 const char *errors,
4875 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004877 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004878 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004879 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004880#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004881 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004882#else
4883 const int pairs = 0;
4884#endif
Tim Peters772747b2001-08-09 22:21:55 +00004885 /* Offsets from p for storing byte pairs in the right order. */
4886#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4887 int ihi = 1, ilo = 0;
4888#else
4889 int ihi = 0, ilo = 1;
4890#endif
4891
Benjamin Peterson29060642009-01-31 22:14:21 +00004892#define STORECHAR(CH) \
4893 do { \
4894 p[ihi] = ((CH) >> 8) & 0xff; \
4895 p[ilo] = (CH) & 0xff; \
4896 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004897 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004899#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004900 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 if (s[i] >= 0x10000)
4902 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004903#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004904 /* 2 * (size + pairs + (byteorder == 0)) */
4905 if (size > PY_SSIZE_T_MAX ||
4906 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004908 nsize = size + pairs + (byteorder == 0);
4909 bytesize = nsize * 2;
4910 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004911 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004912 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913 if (v == NULL)
4914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004916 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004918 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004919 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004920 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004921
4922 if (byteorder == -1) {
4923 /* force LE */
4924 ihi = 1;
4925 ilo = 0;
4926 }
4927 else if (byteorder == 1) {
4928 /* force BE */
4929 ihi = 0;
4930 ilo = 1;
4931 }
4932
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004933 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 Py_UNICODE ch = *s++;
4935 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004936#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 if (ch >= 0x10000) {
4938 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4939 ch = 0xD800 | ((ch-0x10000) >> 10);
4940 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004941#endif
Tim Peters772747b2001-08-09 22:21:55 +00004942 STORECHAR(ch);
4943 if (ch2)
4944 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004945 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004946
4947 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004948 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004949#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950}
4951
Alexander Belopolsky40018472011-02-26 01:02:56 +00004952PyObject *
4953PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954{
4955 if (!PyUnicode_Check(unicode)) {
4956 PyErr_BadArgument();
4957 return NULL;
4958 }
4959 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004960 PyUnicode_GET_SIZE(unicode),
4961 NULL,
4962 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963}
4964
4965/* --- Unicode Escape Codec ----------------------------------------------- */
4966
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   s     escaped input bytes
   size  number of bytes in `s`

   Returns -1 if `size` is negative, if a non-ASCII byte is present, if
   the input ends inside an escape, or if an escape is found that may
   leave the ASCII range (octal, \x, \u, \U, \N).  Otherwise returns the
   length of the buffer required to hold the decoded string.
   */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before doing any pointer arithmetic
       with it. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5021
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII: when `kind` is PyUnicode_WCHAR_KIND the
   value is stored as a Py_UNICODE unit, otherwise as a single byte. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` as a Py_UNICODE unit at buf[index].  The assertion reads
   a variable named `kind` from the scope of the caller (it is not a
   macro parameter).  The whole expansion is parenthesised so that it
   behaves as one expression wherever it is used. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5035
5036
/* Cached pointer to the name-lookup C API used for \N{...} escapes.
   Starts out NULL; PyUnicode_DecodeUnicodeEscape fills it in with
   PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1) the first time it is
   needed. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005038
Alexander Belopolsky40018472011-02-26 01:02:56 +00005039PyObject *
5040PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005041 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005042 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005044 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005045 Py_ssize_t startinpos;
5046 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005047 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005049 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005051 char* message;
5052 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005053 PyObject *errorHandler = NULL;
5054 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005055 Py_ssize_t ascii_length;
5056 Py_ssize_t i;
5057 int kind;
5058 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005060 ascii_length = length_of_escaped_ascii_string(s, size);
5061
5062 /* After length_of_escaped_ascii_string() there are two alternatives,
5063 either the string is pure ASCII with named escapes like \n, etc.
5064 and we determined it's exact size (common case)
5065 or it contains \x, \u, ... escape sequences. then we create a
5066 legacy wchar string and resize it at the end of this function. */
5067 if (ascii_length >= 0) {
5068 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5069 if (!v)
5070 goto onError;
5071 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5072 kind = PyUnicode_1BYTE_KIND;
5073 data = PyUnicode_DATA(v);
5074 }
5075 else {
5076 /* Escaped strings will always be longer than the resulting
5077 Unicode string, so we start with size here and then reduce the
5078 length after conversion to the true value.
5079 (but if the error callback returns a long replacement string
5080 we'll have to allocate more space) */
5081 v = _PyUnicode_New(size);
5082 if (!v)
5083 goto onError;
5084 kind = PyUnicode_WCHAR_KIND;
5085 data = PyUnicode_AS_UNICODE(v);
5086 }
5087
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088 if (size == 0)
5089 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005090 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005092
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093 while (s < end) {
5094 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005095 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005096 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005098 if (kind == PyUnicode_WCHAR_KIND) {
5099 assert(i < _PyUnicode_WSTR_LENGTH(v));
5100 }
5101 else {
5102 /* The only case in which i == ascii_length is a backslash
5103 followed by a newline. */
5104 assert(i <= ascii_length);
5105 }
5106
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 /* Non-escape characters are interpreted as Unicode ordinals */
5108 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005109 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 continue;
5111 }
5112
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 /* \ - Escapes */
5115 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005116 c = *s++;
5117 if (s > end)
5118 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005119
5120 if (kind == PyUnicode_WCHAR_KIND) {
5121 assert(i < _PyUnicode_WSTR_LENGTH(v));
5122 }
5123 else {
5124 /* The only case in which i == ascii_length is a backslash
5125 followed by a newline. */
5126 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5127 }
5128
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005129 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005133 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5134 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5135 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5136 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5137 /* FF */
5138 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5139 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5140 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5141 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5142 /* VT */
5143 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5144 /* BEL, not classic C */
5145 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 case '0': case '1': case '2': case '3':
5149 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005150 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005151 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005152 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005153 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005154 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005156 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 break;
5158
Benjamin Peterson29060642009-01-31 22:14:21 +00005159 /* hex escapes */
5160 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005162 digits = 2;
5163 message = "truncated \\xXX escape";
5164 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005168 digits = 4;
5169 message = "truncated \\uXXXX escape";
5170 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005173 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005174 digits = 8;
5175 message = "truncated \\UXXXXXXXX escape";
5176 hexescape:
5177 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005178 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179 if (s+digits>end) {
5180 endinpos = size;
5181 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 errors, &errorHandler,
5183 "unicodeescape", "end of string in escape sequence",
5184 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005185 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005186 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005187 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005188 goto nextByte;
5189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005190 for (j = 0; j < digits; ++j) {
5191 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005192 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005193 endinpos = (s+j+1)-starts;
5194 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005195 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 errors, &errorHandler,
5197 "unicodeescape", message,
5198 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005199 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005200 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005201 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005202 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005203 }
5204 chr = (chr<<4) & ~0xF;
5205 if (c >= '0' && c <= '9')
5206 chr += c - '0';
5207 else if (c >= 'a' && c <= 'f')
5208 chr += 10 + c - 'a';
5209 else
5210 chr += 10 + c - 'A';
5211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005212 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005213 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 /* _decoding_error will have already written into the
5215 target buffer. */
5216 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005217 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005218 /* when we get here, chr is a 32-bit unicode character */
5219 if (chr <= 0xffff)
5220 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005221 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005222 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005223 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005224 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005225#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005226 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005227#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005228 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005229 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5230 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005231#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005232 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005233 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005234 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005235 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 errors, &errorHandler,
5237 "unicodeescape", "illegal Unicode character",
5238 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005239 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005240 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005241 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005242 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005243 break;
5244
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005246 case 'N':
5247 message = "malformed \\N character escape";
5248 if (ucnhash_CAPI == NULL) {
5249 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005250 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5251 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005252 if (ucnhash_CAPI == NULL)
5253 goto ucnhashError;
5254 }
5255 if (*s == '{') {
5256 const char *start = s+1;
5257 /* look for the closing brace */
5258 while (*s != '}' && s < end)
5259 s++;
5260 if (s > start && s < end && *s == '}') {
5261 /* found a name. look it up in the unicode database */
5262 message = "unknown Unicode character name";
5263 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005264 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5265 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005266 goto store;
5267 }
5268 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005269 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005270 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005271 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 errors, &errorHandler,
5273 "unicodeescape", message,
5274 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005275 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005276 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005277 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005278 break;
5279
5280 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005281 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005282 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005283 message = "\\ at end of string";
5284 s--;
5285 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005286 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005287 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 errors, &errorHandler,
5289 "unicodeescape", message,
5290 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005291 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005292 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005293 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005294 }
5295 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005296 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5297 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005298 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005299 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005302 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005304 /* Ensure the length prediction worked in case of ASCII strings */
5305 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5306
5307 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5308 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005309 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005310 Py_XDECREF(errorHandler);
5311 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005313
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005315 PyErr_SetString(
5316 PyExc_UnicodeError,
5317 "\\N escapes not supported (can't load unicodedata module)"
5318 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005319 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005320 Py_XDECREF(errorHandler);
5321 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005322 return NULL;
5323
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005326 Py_XDECREF(errorHandler);
5327 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 return NULL;
5329}
5330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005331#undef WRITE_ASCII_OR_WSTR
5332#undef WRITE_WSTR
5333
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334/* Return a Unicode-Escape string version of the Unicode object.
5335
5336 If quotes is true, the string is enclosed in u"" or u'' quotes as
5337 appropriate.
5338
5339*/
5340
Walter Dörwald79e913e2007-05-12 11:08:06 +00005341static const char *hexdigits = "0123456789abcdef";
5342
Alexander Belopolsky40018472011-02-26 01:02:56 +00005343PyObject *
5344PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005345 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005347 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005350#ifdef Py_UNICODE_WIDE
5351 const Py_ssize_t expandsize = 10;
5352#else
5353 const Py_ssize_t expandsize = 6;
5354#endif
5355
Thomas Wouters89f507f2006-12-13 04:49:30 +00005356 /* XXX(nnorwitz): rather than over-allocating, it would be
5357 better to choose a different scheme. Perhaps scan the
5358 first N-chars of the string and allocate based on that size.
5359 */
5360 /* Initial allocation is based on the longest-possible unichr
5361 escape.
5362
5363 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5364 unichr, so in this case it's the longest unichr escape. In
5365 narrow (UTF-16) builds this is five chars per source unichr
5366 since there are two unichrs in the surrogate pair, so in narrow
5367 (UTF-16) builds it's not the longest unichr escape.
5368
5369 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5370 so in the narrow (UTF-16) build case it's the longest unichr
5371 escape.
5372 */
5373
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005374 if (size == 0)
5375 return PyBytes_FromStringAndSize(NULL, 0);
5376
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005377 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005379
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005380 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 2
5382 + expandsize*size
5383 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 if (repr == NULL)
5385 return NULL;
5386
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005387 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 while (size-- > 0) {
5390 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005391
Walter Dörwald79e913e2007-05-12 11:08:06 +00005392 /* Escape backslashes */
5393 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 *p++ = '\\';
5395 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005396 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005397 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005398
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005399#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005400 /* Map 21-bit characters to '\U00xxxxxx' */
5401 else if (ch >= 0x10000) {
5402 *p++ = '\\';
5403 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005404 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5405 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5406 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5407 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5408 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5409 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5410 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5411 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005413 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005414#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5416 else if (ch >= 0xD800 && ch < 0xDC00) {
5417 Py_UNICODE ch2;
5418 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005419
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 ch2 = *s++;
5421 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005422 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005423 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5424 *p++ = '\\';
5425 *p++ = 'U';
5426 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5427 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5428 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5429 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5430 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5431 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5432 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5433 *p++ = hexdigits[ucs & 0x0000000F];
5434 continue;
5435 }
5436 /* Fall through: isolated surrogates are copied as-is */
5437 s--;
5438 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005439 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005440#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005441
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005443 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 *p++ = '\\';
5445 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005446 *p++ = hexdigits[(ch >> 12) & 0x000F];
5447 *p++ = hexdigits[(ch >> 8) & 0x000F];
5448 *p++ = hexdigits[(ch >> 4) & 0x000F];
5449 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005451
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005452 /* Map special whitespace to '\t', \n', '\r' */
5453 else if (ch == '\t') {
5454 *p++ = '\\';
5455 *p++ = 't';
5456 }
5457 else if (ch == '\n') {
5458 *p++ = '\\';
5459 *p++ = 'n';
5460 }
5461 else if (ch == '\r') {
5462 *p++ = '\\';
5463 *p++ = 'r';
5464 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005465
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005466 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005467 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005469 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005470 *p++ = hexdigits[(ch >> 4) & 0x000F];
5471 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005472 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005473
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 /* Copy everything else as-is */
5475 else
5476 *p++ = (char) ch;
5477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005479 assert(p - PyBytes_AS_STRING(repr) > 0);
5480 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5481 return NULL;
5482 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483}
5484
Alexander Belopolsky40018472011-02-26 01:02:56 +00005485PyObject *
5486PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005488 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 if (!PyUnicode_Check(unicode)) {
5490 PyErr_BadArgument();
5491 return NULL;
5492 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005493 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5494 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005495 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496}
5497
5498/* --- Raw Unicode Escape Codec ------------------------------------------- */
5499
Alexander Belopolsky40018472011-02-26 01:02:56 +00005500PyObject *
5501PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005502 Py_ssize_t size,
5503 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005506 Py_ssize_t startinpos;
5507 Py_ssize_t endinpos;
5508 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 const char *end;
5512 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005513 PyObject *errorHandler = NULL;
5514 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005515
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 /* Escaped strings will always be longer than the resulting
5517 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005518 length after conversion to the true value. (But decoding error
5519 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 v = _PyUnicode_New(size);
5521 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005524 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005525 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 end = s + size;
5527 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 unsigned char c;
5529 Py_UCS4 x;
5530 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005531 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 /* Non-escape characters are interpreted as Unicode ordinals */
5534 if (*s != '\\') {
5535 *p++ = (unsigned char)*s++;
5536 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005537 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 startinpos = s-starts;
5539
5540 /* \u-escapes are only interpreted iff the number of leading
5541 backslashes if odd */
5542 bs = s;
5543 for (;s < end;) {
5544 if (*s != '\\')
5545 break;
5546 *p++ = (unsigned char)*s++;
5547 }
5548 if (((s - bs) & 1) == 0 ||
5549 s >= end ||
5550 (*s != 'u' && *s != 'U')) {
5551 continue;
5552 }
5553 p--;
5554 count = *s=='u' ? 4 : 8;
5555 s++;
5556
5557 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5558 outpos = p-PyUnicode_AS_UNICODE(v);
5559 for (x = 0, i = 0; i < count; ++i, ++s) {
5560 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005561 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 endinpos = s-starts;
5563 if (unicode_decode_call_errorhandler(
5564 errors, &errorHandler,
5565 "rawunicodeescape", "truncated \\uXXXX",
5566 &starts, &end, &startinpos, &endinpos, &exc, &s,
5567 &v, &outpos, &p))
5568 goto onError;
5569 goto nextByte;
5570 }
5571 x = (x<<4) & ~0xF;
5572 if (c >= '0' && c <= '9')
5573 x += c - '0';
5574 else if (c >= 'a' && c <= 'f')
5575 x += 10 + c - 'a';
5576 else
5577 x += 10 + c - 'A';
5578 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005579 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 /* UCS-2 character */
5581 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005582 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 /* UCS-4 character. Either store directly, or as
5584 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005585#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005587#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 x -= 0x10000L;
5589 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5590 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005591#endif
5592 } else {
5593 endinpos = s-starts;
5594 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005595 if (unicode_decode_call_errorhandler(
5596 errors, &errorHandler,
5597 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 &starts, &end, &startinpos, &endinpos, &exc, &s,
5599 &v, &outpos, &p))
5600 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005601 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005602 nextByte:
5603 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005605 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607 Py_XDECREF(errorHandler);
5608 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005609 if (PyUnicode_READY(v) == -1) {
5610 Py_DECREF(v);
5611 return NULL;
5612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005614
Benjamin Peterson29060642009-01-31 22:14:21 +00005615 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617 Py_XDECREF(errorHandler);
5618 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 return NULL;
5620}
5621
Alexander Belopolsky40018472011-02-26 01:02:56 +00005622PyObject *
5623PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005624 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005626 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 char *p;
5628 char *q;
5629
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005630#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005631 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005632#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005633 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005634#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005635
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005636 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005638
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005639 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 if (repr == NULL)
5641 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005642 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005643 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005645 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 while (size-- > 0) {
5647 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005648#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 /* Map 32-bit characters to '\Uxxxxxxxx' */
5650 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005651 *p++ = '\\';
5652 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005653 *p++ = hexdigits[(ch >> 28) & 0xf];
5654 *p++ = hexdigits[(ch >> 24) & 0xf];
5655 *p++ = hexdigits[(ch >> 20) & 0xf];
5656 *p++ = hexdigits[(ch >> 16) & 0xf];
5657 *p++ = hexdigits[(ch >> 12) & 0xf];
5658 *p++ = hexdigits[(ch >> 8) & 0xf];
5659 *p++ = hexdigits[(ch >> 4) & 0xf];
5660 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005661 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005662 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005663#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5665 if (ch >= 0xD800 && ch < 0xDC00) {
5666 Py_UNICODE ch2;
5667 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005668
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 ch2 = *s++;
5670 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005671 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5673 *p++ = '\\';
5674 *p++ = 'U';
5675 *p++ = hexdigits[(ucs >> 28) & 0xf];
5676 *p++ = hexdigits[(ucs >> 24) & 0xf];
5677 *p++ = hexdigits[(ucs >> 20) & 0xf];
5678 *p++ = hexdigits[(ucs >> 16) & 0xf];
5679 *p++ = hexdigits[(ucs >> 12) & 0xf];
5680 *p++ = hexdigits[(ucs >> 8) & 0xf];
5681 *p++ = hexdigits[(ucs >> 4) & 0xf];
5682 *p++ = hexdigits[ucs & 0xf];
5683 continue;
5684 }
5685 /* Fall through: isolated surrogates are copied as-is */
5686 s--;
5687 size++;
5688 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005689#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 /* Map 16-bit characters to '\uxxxx' */
5691 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 *p++ = '\\';
5693 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005694 *p++ = hexdigits[(ch >> 12) & 0xf];
5695 *p++ = hexdigits[(ch >> 8) & 0xf];
5696 *p++ = hexdigits[(ch >> 4) & 0xf];
5697 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 /* Copy everything else as-is */
5700 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 *p++ = (char) ch;
5702 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005703 size = p - q;
5704
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005705 assert(size > 0);
5706 if (_PyBytes_Resize(&repr, size) < 0)
5707 return NULL;
5708 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709}
5710
Alexander Belopolsky40018472011-02-26 01:02:56 +00005711PyObject *
5712PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005714 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005716 PyErr_BadArgument();
5717 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005719 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5720 PyUnicode_GET_SIZE(unicode));
5721
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005722 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723}
5724
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005725/* --- Unicode Internal Codec ------------------------------------------- */
5726
Alexander Belopolsky40018472011-02-26 01:02:56 +00005727PyObject *
5728_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005729 Py_ssize_t size,
5730 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005731{
5732 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005733 Py_ssize_t startinpos;
5734 Py_ssize_t endinpos;
5735 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005736 PyUnicodeObject *v;
5737 Py_UNICODE *p;
5738 const char *end;
5739 const char *reason;
5740 PyObject *errorHandler = NULL;
5741 PyObject *exc = NULL;
5742
Neal Norwitzd43069c2006-01-08 01:12:10 +00005743#ifdef Py_UNICODE_WIDE
5744 Py_UNICODE unimax = PyUnicode_GetMax();
5745#endif
5746
Thomas Wouters89f507f2006-12-13 04:49:30 +00005747 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005748 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5749 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005751 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5752 as string was created with the old API. */
5753 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005755 p = PyUnicode_AS_UNICODE(v);
5756 end = s + size;
5757
5758 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005759 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005760 /* We have to sanity check the raw data, otherwise doom looms for
5761 some malformed UCS-4 data. */
5762 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005763#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005764 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005765#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005766 end-s < Py_UNICODE_SIZE
5767 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005769 startinpos = s - starts;
5770 if (end-s < Py_UNICODE_SIZE) {
5771 endinpos = end-starts;
5772 reason = "truncated input";
5773 }
5774 else {
5775 endinpos = s - starts + Py_UNICODE_SIZE;
5776 reason = "illegal code point (> 0x10FFFF)";
5777 }
5778 outpos = p - PyUnicode_AS_UNICODE(v);
5779 if (unicode_decode_call_errorhandler(
5780 errors, &errorHandler,
5781 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005782 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005783 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005784 goto onError;
5785 }
5786 }
5787 else {
5788 p++;
5789 s += Py_UNICODE_SIZE;
5790 }
5791 }
5792
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005793 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005794 goto onError;
5795 Py_XDECREF(errorHandler);
5796 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005797 if (PyUnicode_READY(v) == -1) {
5798 Py_DECREF(v);
5799 return NULL;
5800 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005801 return (PyObject *)v;
5802
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005804 Py_XDECREF(v);
5805 Py_XDECREF(errorHandler);
5806 Py_XDECREF(exc);
5807 return NULL;
5808}
5809
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810/* --- Latin-1 Codec ------------------------------------------------------ */
5811
Alexander Belopolsky40018472011-02-26 01:02:56 +00005812PyObject *
5813PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005814 Py_ssize_t size,
5815 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005818 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819}
5820
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005822static void
5823make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005824 const char *encoding,
5825 const Py_UNICODE *unicode, Py_ssize_t size,
5826 Py_ssize_t startpos, Py_ssize_t endpos,
5827 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 *exceptionObject = PyUnicodeEncodeError_Create(
5831 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 }
5833 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5835 goto onError;
5836 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5837 goto onError;
5838 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5839 goto onError;
5840 return;
5841 onError:
5842 Py_DECREF(*exceptionObject);
5843 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 }
5845}
5846
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005847/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005848static void
5849raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005850 const char *encoding,
5851 const Py_UNICODE *unicode, Py_ssize_t size,
5852 Py_ssize_t startpos, Py_ssize_t endpos,
5853 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005854{
5855 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005856 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005857 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005859}
5860
5861/* error handling callback helper:
5862 build arguments, call the callback and check the arguments,
5863 put the result into newpos and return the replacement string, which
5864 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005865static PyObject *
5866unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005867 PyObject **errorHandler,
5868 const char *encoding, const char *reason,
5869 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5870 Py_ssize_t startpos, Py_ssize_t endpos,
5871 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005872{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005873 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874
5875 PyObject *restuple;
5876 PyObject *resunicode;
5877
5878 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005879 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 }
5883
5884 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005886 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005888
5889 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005890 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005891 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005893 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005894 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 Py_DECREF(restuple);
5896 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005898 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 &resunicode, newpos)) {
5900 Py_DECREF(restuple);
5901 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005902 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005903 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5904 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5905 Py_DECREF(restuple);
5906 return NULL;
5907 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005908 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005910 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005911 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5912 Py_DECREF(restuple);
5913 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005914 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005915 Py_INCREF(resunicode);
5916 Py_DECREF(restuple);
5917 return resunicode;
5918}
5919
Alexander Belopolsky40018472011-02-26 01:02:56 +00005920static PyObject *
5921unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005922 Py_ssize_t size,
5923 const char *errors,
5924 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925{
5926 /* output object */
5927 PyObject *res;
5928 /* pointers to the beginning and end+1 of input */
5929 const Py_UNICODE *startp = p;
5930 const Py_UNICODE *endp = p + size;
5931 /* pointer to the beginning of the unencodable characters */
5932 /* const Py_UNICODE *badp = NULL; */
5933 /* pointer into the output */
5934 char *str;
5935 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005936 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005937 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5938 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005939 PyObject *errorHandler = NULL;
5940 PyObject *exc = NULL;
5941 /* the following variable is used for caching string comparisons
5942 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5943 int known_errorHandler = -1;
5944
5945 /* allocate enough for a simple encoding without
5946 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005947 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005948 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005949 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005950 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005951 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005952 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005953 ressize = size;
5954
5955 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005957
Benjamin Peterson29060642009-01-31 22:14:21 +00005958 /* can we encode this? */
5959 if (c<limit) {
5960 /* no overflow check, because we know that the space is enough */
5961 *str++ = (char)c;
5962 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005963 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 else {
5965 Py_ssize_t unicodepos = p-startp;
5966 Py_ssize_t requiredsize;
5967 PyObject *repunicode;
5968 Py_ssize_t repsize;
5969 Py_ssize_t newpos;
5970 Py_ssize_t respos;
5971 Py_UNICODE *uni2;
5972 /* startpos for collecting unencodable chars */
5973 const Py_UNICODE *collstart = p;
5974 const Py_UNICODE *collend = p;
5975 /* find all unecodable characters */
5976 while ((collend < endp) && ((*collend)>=limit))
5977 ++collend;
5978 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5979 if (known_errorHandler==-1) {
5980 if ((errors==NULL) || (!strcmp(errors, "strict")))
5981 known_errorHandler = 1;
5982 else if (!strcmp(errors, "replace"))
5983 known_errorHandler = 2;
5984 else if (!strcmp(errors, "ignore"))
5985 known_errorHandler = 3;
5986 else if (!strcmp(errors, "xmlcharrefreplace"))
5987 known_errorHandler = 4;
5988 else
5989 known_errorHandler = 0;
5990 }
5991 switch (known_errorHandler) {
5992 case 1: /* strict */
5993 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5994 goto onError;
5995 case 2: /* replace */
5996 while (collstart++<collend)
5997 *str++ = '?'; /* fall through */
5998 case 3: /* ignore */
5999 p = collend;
6000 break;
6001 case 4: /* xmlcharrefreplace */
6002 respos = str - PyBytes_AS_STRING(res);
6003 /* determine replacement size (temporarily (mis)uses p) */
6004 for (p = collstart, repsize = 0; p < collend; ++p) {
6005 if (*p<10)
6006 repsize += 2+1+1;
6007 else if (*p<100)
6008 repsize += 2+2+1;
6009 else if (*p<1000)
6010 repsize += 2+3+1;
6011 else if (*p<10000)
6012 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006013#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 else
6015 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006016#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 else if (*p<100000)
6018 repsize += 2+5+1;
6019 else if (*p<1000000)
6020 repsize += 2+6+1;
6021 else
6022 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006023#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 }
6025 requiredsize = respos+repsize+(endp-collend);
6026 if (requiredsize > ressize) {
6027 if (requiredsize<2*ressize)
6028 requiredsize = 2*ressize;
6029 if (_PyBytes_Resize(&res, requiredsize))
6030 goto onError;
6031 str = PyBytes_AS_STRING(res) + respos;
6032 ressize = requiredsize;
6033 }
6034 /* generate replacement (temporarily (mis)uses p) */
6035 for (p = collstart; p < collend; ++p) {
6036 str += sprintf(str, "&#%d;", (int)*p);
6037 }
6038 p = collend;
6039 break;
6040 default:
6041 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6042 encoding, reason, startp, size, &exc,
6043 collstart-startp, collend-startp, &newpos);
6044 if (repunicode == NULL)
6045 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006046 if (PyBytes_Check(repunicode)) {
6047 /* Directly copy bytes result to output. */
6048 repsize = PyBytes_Size(repunicode);
6049 if (repsize > 1) {
6050 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006051 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006052 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6053 Py_DECREF(repunicode);
6054 goto onError;
6055 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006056 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006057 ressize += repsize-1;
6058 }
6059 memcpy(str, PyBytes_AsString(repunicode), repsize);
6060 str += repsize;
6061 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006062 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006063 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006064 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 /* need more space? (at least enough for what we
6066 have+the replacement+the rest of the string, so
6067 we won't have to check space for encodable characters) */
6068 respos = str - PyBytes_AS_STRING(res);
6069 repsize = PyUnicode_GET_SIZE(repunicode);
6070 requiredsize = respos+repsize+(endp-collend);
6071 if (requiredsize > ressize) {
6072 if (requiredsize<2*ressize)
6073 requiredsize = 2*ressize;
6074 if (_PyBytes_Resize(&res, requiredsize)) {
6075 Py_DECREF(repunicode);
6076 goto onError;
6077 }
6078 str = PyBytes_AS_STRING(res) + respos;
6079 ressize = requiredsize;
6080 }
6081 /* check if there is anything unencodable in the replacement
6082 and copy it to the output */
6083 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6084 c = *uni2;
6085 if (c >= limit) {
6086 raise_encode_exception(&exc, encoding, startp, size,
6087 unicodepos, unicodepos+1, reason);
6088 Py_DECREF(repunicode);
6089 goto onError;
6090 }
6091 *str = (char)c;
6092 }
6093 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006094 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006095 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006096 }
6097 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006098 /* Resize if we allocated to much */
6099 size = str - PyBytes_AS_STRING(res);
6100 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006101 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006102 if (_PyBytes_Resize(&res, size) < 0)
6103 goto onError;
6104 }
6105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106 Py_XDECREF(errorHandler);
6107 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006108 return res;
6109
6110 onError:
6111 Py_XDECREF(res);
6112 Py_XDECREF(errorHandler);
6113 Py_XDECREF(exc);
6114 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006115}
6116
Alexander Belopolsky40018472011-02-26 01:02:56 +00006117PyObject *
6118PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006119 Py_ssize_t size,
6120 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123}
6124
Alexander Belopolsky40018472011-02-26 01:02:56 +00006125PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006126_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127{
6128 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 PyErr_BadArgument();
6130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006132 if (PyUnicode_READY(unicode) == -1)
6133 return NULL;
6134 /* Fast path: if it is a one-byte string, construct
6135 bytes object directly. */
6136 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6137 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6138 PyUnicode_GET_LENGTH(unicode));
6139 /* Non-Latin-1 characters present. Defer to above function to
6140 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006143 errors);
6144}
6145
6146PyObject*
6147PyUnicode_AsLatin1String(PyObject *unicode)
6148{
6149 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150}
6151
6152/* --- 7-bit ASCII Codec -------------------------------------------------- */
6153
Alexander Belopolsky40018472011-02-26 01:02:56 +00006154PyObject *
6155PyUnicode_DecodeASCII(const char *s,
6156 Py_ssize_t size,
6157 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006159 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 PyUnicodeObject *v;
6161 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006162 Py_ssize_t startinpos;
6163 Py_ssize_t endinpos;
6164 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006165 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006166 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006167 PyObject *errorHandler = NULL;
6168 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006169 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006170
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006172 if (size == 1 && *(unsigned char*)s < 128)
6173 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6174
6175 /* Fast path. Assume the input actually *is* ASCII, and allocate
6176 a single-block Unicode object with that assumption. If there is
6177 an error, drop the object and start over. */
6178 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6179 if (v == NULL)
6180 goto onError;
6181 d = PyUnicode_1BYTE_DATA(v);
6182 for (i = 0; i < size; i++) {
6183 unsigned char ch = ((unsigned char*)s)[i];
6184 if (ch < 128)
6185 d[i] = ch;
6186 else
6187 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006189 if (i == size)
6190 return (PyObject*)v;
6191 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006192
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 v = _PyUnicode_New(size);
6194 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006199 e = s + size;
6200 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 register unsigned char c = (unsigned char)*s;
6202 if (c < 128) {
6203 *p++ = c;
6204 ++s;
6205 }
6206 else {
6207 startinpos = s-starts;
6208 endinpos = startinpos + 1;
6209 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6210 if (unicode_decode_call_errorhandler(
6211 errors, &errorHandler,
6212 "ascii", "ordinal not in range(128)",
6213 &starts, &e, &startinpos, &endinpos, &exc, &s,
6214 &v, &outpos, &p))
6215 goto onError;
6216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006218 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6220 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006221 Py_XDECREF(errorHandler);
6222 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006223 if (PyUnicode_READY(v) == -1) {
6224 Py_DECREF(v);
6225 return NULL;
6226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006228
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006231 Py_XDECREF(errorHandler);
6232 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 return NULL;
6234}
6235
Alexander Belopolsky40018472011-02-26 01:02:56 +00006236PyObject *
6237PyUnicode_EncodeASCII(const Py_UNICODE *p,
6238 Py_ssize_t size,
6239 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006241 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242}
6243
Alexander Belopolsky40018472011-02-26 01:02:56 +00006244PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006245_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246{
6247 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 PyErr_BadArgument();
6249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006251 if (PyUnicode_READY(unicode) == -1)
6252 return NULL;
6253 /* Fast path: if it is an ASCII-only string, construct bytes object
6254 directly. Else defer to above function to raise the exception. */
6255 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6256 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6257 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006260 errors);
6261}
6262
6263PyObject *
6264PyUnicode_AsASCIIString(PyObject *unicode)
6265{
6266 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267}
6268
Victor Stinner99b95382011-07-04 14:23:54 +02006269#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006270
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006271/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006272
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006273#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006274#define NEED_RETRY
6275#endif
6276
6277/* XXX This code is limited to "true" double-byte encodings, as
6278 a) it assumes an incomplete character consists of a single byte, and
6279 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006281
/* Return 1 if the byte at s[offset] is reported as a lead byte by
   IsDBCSLeadByte() and, judging by the character found in front of it
   with CharPrev(), is not already part of that character; 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev() did not move: nothing in front of curr */
    if (prev == curr)
        return 1;
    /* the preceding character does not start with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* the preceding character is two bytes long and ends before curr */
    return (curr - prev == 2);
}
6293
6294/*
6295 * Decode MBCS string into unicode object. If 'final' is set, converts
6296 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6297 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006298static int
6299decode_mbcs(PyUnicodeObject **v,
6300 const char *s, /* MBCS string */
6301 int size, /* sizeof MBCS string */
6302 int final,
6303 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006304{
6305 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006306 Py_ssize_t n;
6307 DWORD usize;
6308 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006309
6310 assert(size >= 0);
6311
Victor Stinner554f3f02010-06-16 23:33:54 +00006312 /* check and handle 'errors' arg */
6313 if (errors==NULL || strcmp(errors, "strict")==0)
6314 flags = MB_ERR_INVALID_CHARS;
6315 else if (strcmp(errors, "ignore")==0)
6316 flags = 0;
6317 else {
6318 PyErr_Format(PyExc_ValueError,
6319 "mbcs encoding does not support errors='%s'",
6320 errors);
6321 return -1;
6322 }
6323
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006324 /* Skip trailing lead-byte unless 'final' is set */
6325 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006327
6328 /* First get the size of the result */
6329 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006330 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6331 if (usize==0)
6332 goto mbcs_decode_error;
6333 } else
6334 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006335
6336 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 /* Create unicode object */
6338 *v = _PyUnicode_New(usize);
6339 if (*v == NULL)
6340 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006341 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006342 }
6343 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 /* Extend unicode object */
6345 n = PyUnicode_GET_SIZE(*v);
6346 if (_PyUnicode_Resize(v, n + usize) < 0)
6347 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006348 }
6349
6350 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006351 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006353 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6354 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006356 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006357 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006358
6359mbcs_decode_error:
6360 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6361 we raise a UnicodeDecodeError - else it is a 'generic'
6362 windows error
6363 */
6364 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6365 /* Ideally, we should get reason from FormatMessage - this
6366 is the Windows 2000 English version of the message
6367 */
6368 PyObject *exc = NULL;
6369 const char *reason = "No mapping for the Unicode character exists "
6370 "in the target multi-byte code page.";
6371 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6372 if (exc != NULL) {
6373 PyCodec_StrictErrors(exc);
6374 Py_DECREF(exc);
6375 }
6376 } else {
6377 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6378 }
6379 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006380}
6381
Alexander Belopolsky40018472011-02-26 01:02:56 +00006382PyObject *
6383PyUnicode_DecodeMBCSStateful(const char *s,
6384 Py_ssize_t size,
6385 const char *errors,
6386 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006387{
6388 PyUnicodeObject *v = NULL;
6389 int done;
6390
6391 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006393
6394#ifdef NEED_RETRY
6395 retry:
6396 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006397 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006398 else
6399#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006400 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006401
6402 if (done < 0) {
6403 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006405 }
6406
6407 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006409
6410#ifdef NEED_RETRY
6411 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 s += done;
6413 size -= done;
6414 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006415 }
6416#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006417 if (PyUnicode_READY(v) == -1) {
6418 Py_DECREF(v);
6419 return NULL;
6420 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006421 return (PyObject *)v;
6422}
6423
Alexander Belopolsky40018472011-02-26 01:02:56 +00006424PyObject *
6425PyUnicode_DecodeMBCS(const char *s,
6426 Py_ssize_t size,
6427 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006428{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006429 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6430}
6431
6432/*
6433 * Convert unicode into string object (MBCS).
6434 * Returns 0 if succeed, -1 otherwise.
6435 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006436static int
6437encode_mbcs(PyObject **repr,
6438 const Py_UNICODE *p, /* unicode */
6439 int size, /* size of unicode */
6440 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006441{
Victor Stinner554f3f02010-06-16 23:33:54 +00006442 BOOL usedDefaultChar = FALSE;
6443 BOOL *pusedDefaultChar;
6444 int mbcssize;
6445 Py_ssize_t n;
6446 PyObject *exc = NULL;
6447 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006448
6449 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006450
Victor Stinner554f3f02010-06-16 23:33:54 +00006451 /* check and handle 'errors' arg */
6452 if (errors==NULL || strcmp(errors, "strict")==0) {
6453 flags = WC_NO_BEST_FIT_CHARS;
6454 pusedDefaultChar = &usedDefaultChar;
6455 } else if (strcmp(errors, "replace")==0) {
6456 flags = 0;
6457 pusedDefaultChar = NULL;
6458 } else {
6459 PyErr_Format(PyExc_ValueError,
6460 "mbcs encoding does not support errors='%s'",
6461 errors);
6462 return -1;
6463 }
6464
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006465 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006466 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006467 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6468 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 if (mbcssize == 0) {
6470 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6471 return -1;
6472 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006473 /* If we used a default char, then we failed! */
6474 if (pusedDefaultChar && *pusedDefaultChar)
6475 goto mbcs_encode_error;
6476 } else {
6477 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006478 }
6479
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006480 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 /* Create string object */
6482 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6483 if (*repr == NULL)
6484 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006485 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006486 }
6487 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 /* Extend string object */
6489 n = PyBytes_Size(*repr);
6490 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6491 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006492 }
6493
6494 /* Do the conversion */
6495 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006497 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6498 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6500 return -1;
6501 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006502 if (pusedDefaultChar && *pusedDefaultChar)
6503 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006504 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006505 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006506
6507mbcs_encode_error:
6508 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6509 Py_XDECREF(exc);
6510 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006511}
6512
Alexander Belopolsky40018472011-02-26 01:02:56 +00006513PyObject *
6514PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6515 Py_ssize_t size,
6516 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006517{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006518 PyObject *repr = NULL;
6519 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006520
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006521#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006523 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006524 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006525 else
6526#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006527 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006528
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006529 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 Py_XDECREF(repr);
6531 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006532 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006533
6534#ifdef NEED_RETRY
6535 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 p += INT_MAX;
6537 size -= INT_MAX;
6538 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006539 }
6540#endif
6541
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006542 return repr;
6543}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006544
Alexander Belopolsky40018472011-02-26 01:02:56 +00006545PyObject *
6546PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006547{
6548 if (!PyUnicode_Check(unicode)) {
6549 PyErr_BadArgument();
6550 return NULL;
6551 }
6552 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 PyUnicode_GET_SIZE(unicode),
6554 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006555}
6556
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006557#undef NEED_RETRY
6558
Victor Stinner99b95382011-07-04 14:23:54 +02006559#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006560
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561/* --- Character Mapping Codec -------------------------------------------- */
6562
Alexander Belopolsky40018472011-02-26 01:02:56 +00006563PyObject *
6564PyUnicode_DecodeCharmap(const char *s,
6565 Py_ssize_t size,
6566 PyObject *mapping,
6567 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006569 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006570 Py_ssize_t startinpos;
6571 Py_ssize_t endinpos;
6572 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006573 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 PyUnicodeObject *v;
6575 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006576 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006577 PyObject *errorHandler = NULL;
6578 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006579 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006580 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006581
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 /* Default to Latin-1 */
6583 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006584 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585
6586 v = _PyUnicode_New(size);
6587 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006592 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006593 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 mapstring = PyUnicode_AS_UNICODE(mapping);
6595 maplen = PyUnicode_GET_SIZE(mapping);
6596 while (s < e) {
6597 unsigned char ch = *s;
6598 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 if (ch < maplen)
6601 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 if (x == 0xfffe) {
6604 /* undefined mapping */
6605 outpos = p-PyUnicode_AS_UNICODE(v);
6606 startinpos = s-starts;
6607 endinpos = startinpos+1;
6608 if (unicode_decode_call_errorhandler(
6609 errors, &errorHandler,
6610 "charmap", "character maps to <undefined>",
6611 &starts, &e, &startinpos, &endinpos, &exc, &s,
6612 &v, &outpos, &p)) {
6613 goto onError;
6614 }
6615 continue;
6616 }
6617 *p++ = x;
6618 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006619 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006620 }
6621 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 while (s < e) {
6623 unsigned char ch = *s;
6624 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006625
Benjamin Peterson29060642009-01-31 22:14:21 +00006626 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6627 w = PyLong_FromLong((long)ch);
6628 if (w == NULL)
6629 goto onError;
6630 x = PyObject_GetItem(mapping, w);
6631 Py_DECREF(w);
6632 if (x == NULL) {
6633 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6634 /* No mapping found means: mapping is undefined. */
6635 PyErr_Clear();
6636 x = Py_None;
6637 Py_INCREF(x);
6638 } else
6639 goto onError;
6640 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006641
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 /* Apply mapping */
6643 if (PyLong_Check(x)) {
6644 long value = PyLong_AS_LONG(x);
6645 if (value < 0 || value > 65535) {
6646 PyErr_SetString(PyExc_TypeError,
6647 "character mapping must be in range(65536)");
6648 Py_DECREF(x);
6649 goto onError;
6650 }
6651 *p++ = (Py_UNICODE)value;
6652 }
6653 else if (x == Py_None) {
6654 /* undefined mapping */
6655 outpos = p-PyUnicode_AS_UNICODE(v);
6656 startinpos = s-starts;
6657 endinpos = startinpos+1;
6658 if (unicode_decode_call_errorhandler(
6659 errors, &errorHandler,
6660 "charmap", "character maps to <undefined>",
6661 &starts, &e, &startinpos, &endinpos, &exc, &s,
6662 &v, &outpos, &p)) {
6663 Py_DECREF(x);
6664 goto onError;
6665 }
6666 Py_DECREF(x);
6667 continue;
6668 }
6669 else if (PyUnicode_Check(x)) {
6670 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006671
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 if (targetsize == 1)
6673 /* 1-1 mapping */
6674 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006675
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 else if (targetsize > 1) {
6677 /* 1-n mapping */
6678 if (targetsize > extrachars) {
6679 /* resize first */
6680 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6681 Py_ssize_t needed = (targetsize - extrachars) + \
6682 (targetsize << 2);
6683 extrachars += needed;
6684 /* XXX overflow detection missing */
6685 if (_PyUnicode_Resize(&v,
6686 PyUnicode_GET_SIZE(v) + needed) < 0) {
6687 Py_DECREF(x);
6688 goto onError;
6689 }
6690 p = PyUnicode_AS_UNICODE(v) + oldpos;
6691 }
6692 Py_UNICODE_COPY(p,
6693 PyUnicode_AS_UNICODE(x),
6694 targetsize);
6695 p += targetsize;
6696 extrachars -= targetsize;
6697 }
6698 /* 1-0 mapping: skip the character */
6699 }
6700 else {
6701 /* wrong return value */
6702 PyErr_SetString(PyExc_TypeError,
6703 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006704 Py_DECREF(x);
6705 goto onError;
6706 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006707 Py_DECREF(x);
6708 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 }
6711 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006712 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6713 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 Py_XDECREF(errorHandler);
6715 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006716 if (PyUnicode_READY(v) == -1) {
6717 Py_DECREF(v);
6718 return NULL;
6719 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006721
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 Py_XDECREF(errorHandler);
6724 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 Py_XDECREF(v);
6726 return NULL;
6727}
6728
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006729/* Charmap encoding: the lookup table */
6730
Alexander Belopolsky40018472011-02-26 01:02:56 +00006731struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 PyObject_HEAD
6733 unsigned char level1[32];
6734 int count2, count3;
6735 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006736};
6737
6738static PyObject*
6739encoding_map_size(PyObject *obj, PyObject* args)
6740{
6741 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006742 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006744}
6745
6746static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006747 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 PyDoc_STR("Return the size (in bytes) of this object") },
6749 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006750};
6751
6752static void
6753encoding_map_dealloc(PyObject* o)
6754{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006755 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006756}
6757
6758static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006759 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 "EncodingMap", /*tp_name*/
6761 sizeof(struct encoding_map), /*tp_basicsize*/
6762 0, /*tp_itemsize*/
6763 /* methods */
6764 encoding_map_dealloc, /*tp_dealloc*/
6765 0, /*tp_print*/
6766 0, /*tp_getattr*/
6767 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006768 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 0, /*tp_repr*/
6770 0, /*tp_as_number*/
6771 0, /*tp_as_sequence*/
6772 0, /*tp_as_mapping*/
6773 0, /*tp_hash*/
6774 0, /*tp_call*/
6775 0, /*tp_str*/
6776 0, /*tp_getattro*/
6777 0, /*tp_setattro*/
6778 0, /*tp_as_buffer*/
6779 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6780 0, /*tp_doc*/
6781 0, /*tp_traverse*/
6782 0, /*tp_clear*/
6783 0, /*tp_richcompare*/
6784 0, /*tp_weaklistoffset*/
6785 0, /*tp_iter*/
6786 0, /*tp_iternext*/
6787 encoding_map_methods, /*tp_methods*/
6788 0, /*tp_members*/
6789 0, /*tp_getset*/
6790 0, /*tp_base*/
6791 0, /*tp_dict*/
6792 0, /*tp_descr_get*/
6793 0, /*tp_descr_set*/
6794 0, /*tp_dictoffset*/
6795 0, /*tp_init*/
6796 0, /*tp_alloc*/
6797 0, /*tp_new*/
6798 0, /*tp_free*/
6799 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006800};
6801
6802PyObject*
6803PyUnicode_BuildEncodingMap(PyObject* string)
6804{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006805 PyObject *result;
6806 struct encoding_map *mresult;
6807 int i;
6808 int need_dict = 0;
6809 unsigned char level1[32];
6810 unsigned char level2[512];
6811 unsigned char *mlevel1, *mlevel2, *mlevel3;
6812 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006813 int kind;
6814 void *data;
6815 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006817 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006818 PyErr_BadArgument();
6819 return NULL;
6820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006821 kind = PyUnicode_KIND(string);
6822 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006823 memset(level1, 0xFF, sizeof level1);
6824 memset(level2, 0xFF, sizeof level2);
6825
6826 /* If there isn't a one-to-one mapping of NULL to \0,
6827 or if there are non-BMP characters, we need to use
6828 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006829 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006830 need_dict = 1;
6831 for (i = 1; i < 256; i++) {
6832 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006833 ch = PyUnicode_READ(kind, data, i);
6834 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006835 need_dict = 1;
6836 break;
6837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006838 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006839 /* unmapped character */
6840 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006841 l1 = ch >> 11;
6842 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006843 if (level1[l1] == 0xFF)
6844 level1[l1] = count2++;
6845 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006846 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006847 }
6848
6849 if (count2 >= 0xFF || count3 >= 0xFF)
6850 need_dict = 1;
6851
6852 if (need_dict) {
6853 PyObject *result = PyDict_New();
6854 PyObject *key, *value;
6855 if (!result)
6856 return NULL;
6857 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006858 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006859 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006860 if (!key || !value)
6861 goto failed1;
6862 if (PyDict_SetItem(result, key, value) == -1)
6863 goto failed1;
6864 Py_DECREF(key);
6865 Py_DECREF(value);
6866 }
6867 return result;
6868 failed1:
6869 Py_XDECREF(key);
6870 Py_XDECREF(value);
6871 Py_DECREF(result);
6872 return NULL;
6873 }
6874
6875 /* Create a three-level trie */
6876 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6877 16*count2 + 128*count3 - 1);
6878 if (!result)
6879 return PyErr_NoMemory();
6880 PyObject_Init(result, &EncodingMapType);
6881 mresult = (struct encoding_map*)result;
6882 mresult->count2 = count2;
6883 mresult->count3 = count3;
6884 mlevel1 = mresult->level1;
6885 mlevel2 = mresult->level23;
6886 mlevel3 = mresult->level23 + 16*count2;
6887 memcpy(mlevel1, level1, 32);
6888 memset(mlevel2, 0xFF, 16*count2);
6889 memset(mlevel3, 0, 128*count3);
6890 count3 = 0;
6891 for (i = 1; i < 256; i++) {
6892 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006893 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006894 /* unmapped character */
6895 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006896 o1 = PyUnicode_READ(kind, data, i)>>11;
6897 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006898 i2 = 16*mlevel1[o1] + o2;
6899 if (mlevel2[i2] == 0xFF)
6900 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006901 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006902 i3 = 128*mlevel2[i2] + o3;
6903 mlevel3[i3] = i;
6904 }
6905 return result;
6906}
6907
6908static int
6909encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6910{
6911 struct encoding_map *map = (struct encoding_map*)mapping;
6912 int l1 = c>>11;
6913 int l2 = (c>>7) & 0xF;
6914 int l3 = c & 0x7F;
6915 int i;
6916
6917#ifdef Py_UNICODE_WIDE
6918 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006920 }
6921#endif
6922 if (c == 0)
6923 return 0;
6924 /* level 1*/
6925 i = map->level1[l1];
6926 if (i == 0xFF) {
6927 return -1;
6928 }
6929 /* level 2*/
6930 i = map->level23[16*i+l2];
6931 if (i == 0xFF) {
6932 return -1;
6933 }
6934 /* level 3 */
6935 i = map->level23[16*map->count2 + 128*i + l3];
6936 if (i == 0) {
6937 return -1;
6938 }
6939 return i;
6940}
6941
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006942/* Lookup the character ch in the mapping. If the character
6943 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006944 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006945static PyObject *
6946charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947{
Christian Heimes217cfd12007-12-02 14:31:20 +00006948 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006949 PyObject *x;
6950
6951 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006953 x = PyObject_GetItem(mapping, w);
6954 Py_DECREF(w);
6955 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006956 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6957 /* No mapping found means: mapping is undefined. */
6958 PyErr_Clear();
6959 x = Py_None;
6960 Py_INCREF(x);
6961 return x;
6962 } else
6963 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006965 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006967 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 long value = PyLong_AS_LONG(x);
6969 if (value < 0 || value > 255) {
6970 PyErr_SetString(PyExc_TypeError,
6971 "character mapping must be in range(256)");
6972 Py_DECREF(x);
6973 return NULL;
6974 }
6975 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006977 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006980 /* wrong return value */
6981 PyErr_Format(PyExc_TypeError,
6982 "character mapping must return integer, bytes or None, not %.400s",
6983 x->ob_type->tp_name);
6984 Py_DECREF(x);
6985 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 }
6987}
6988
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006989static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006990charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006991{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006992 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6993 /* exponentially overallocate to minimize reallocations */
6994 if (requiredsize < 2*outsize)
6995 requiredsize = 2*outsize;
6996 if (_PyBytes_Resize(outobj, requiredsize))
6997 return -1;
6998 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006999}
7000
/* Outcome of trying to encode one character with charmapencode_output. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007004/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007005 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007006 space is available. Return a new reference to the object that
7007 was put in the output buffer, or Py_None, if the mapping was undefined
7008 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007009 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007010static charmapencode_result
7011charmapencode_output(Py_UNICODE c, PyObject *mapping,
7012 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007014 PyObject *rep;
7015 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007016 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007017
Christian Heimes90aa7642007-12-19 02:45:37 +00007018 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007019 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007021 if (res == -1)
7022 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 if (outsize<requiredsize)
7024 if (charmapencode_resize(outobj, outpos, requiredsize))
7025 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007026 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007027 outstart[(*outpos)++] = (char)res;
7028 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007029 }
7030
7031 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007032 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007033 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007034 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 Py_DECREF(rep);
7036 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007037 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 if (PyLong_Check(rep)) {
7039 Py_ssize_t requiredsize = *outpos+1;
7040 if (outsize<requiredsize)
7041 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7042 Py_DECREF(rep);
7043 return enc_EXCEPTION;
7044 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007045 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007046 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007047 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 else {
7049 const char *repchars = PyBytes_AS_STRING(rep);
7050 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7051 Py_ssize_t requiredsize = *outpos+repsize;
7052 if (outsize<requiredsize)
7053 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7054 Py_DECREF(rep);
7055 return enc_EXCEPTION;
7056 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007057 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007058 memcpy(outstart + *outpos, repchars, repsize);
7059 *outpos += repsize;
7060 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007061 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007062 Py_DECREF(rep);
7063 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007064}
7065
7066/* handle an error in PyUnicode_EncodeCharmap
7067 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007068static int
7069charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007070 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007071 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007072 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007073 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007074{
7075 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007076 Py_ssize_t repsize;
7077 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007078 Py_UNICODE *uni2;
7079 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007080 Py_ssize_t collstartpos = *inpos;
7081 Py_ssize_t collendpos = *inpos+1;
7082 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007083 char *encoding = "charmap";
7084 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007085 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007086
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007087 /* find all unencodable characters */
7088 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007089 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007090 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 int res = encoding_map_lookup(p[collendpos], mapping);
7092 if (res != -1)
7093 break;
7094 ++collendpos;
7095 continue;
7096 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007097
Benjamin Peterson29060642009-01-31 22:14:21 +00007098 rep = charmapencode_lookup(p[collendpos], mapping);
7099 if (rep==NULL)
7100 return -1;
7101 else if (rep!=Py_None) {
7102 Py_DECREF(rep);
7103 break;
7104 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007105 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007107 }
7108 /* cache callback name lookup
7109 * (if not done yet, i.e. it's the first error) */
7110 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007111 if ((errors==NULL) || (!strcmp(errors, "strict")))
7112 *known_errorHandler = 1;
7113 else if (!strcmp(errors, "replace"))
7114 *known_errorHandler = 2;
7115 else if (!strcmp(errors, "ignore"))
7116 *known_errorHandler = 3;
7117 else if (!strcmp(errors, "xmlcharrefreplace"))
7118 *known_errorHandler = 4;
7119 else
7120 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007121 }
7122 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007123 case 1: /* strict */
7124 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7125 return -1;
7126 case 2: /* replace */
7127 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 x = charmapencode_output('?', mapping, res, respos);
7129 if (x==enc_EXCEPTION) {
7130 return -1;
7131 }
7132 else if (x==enc_FAILED) {
7133 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7134 return -1;
7135 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007136 }
7137 /* fall through */
7138 case 3: /* ignore */
7139 *inpos = collendpos;
7140 break;
7141 case 4: /* xmlcharrefreplace */
7142 /* generate replacement (temporarily (mis)uses p) */
7143 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007144 char buffer[2+29+1+1];
7145 char *cp;
7146 sprintf(buffer, "&#%d;", (int)p[collpos]);
7147 for (cp = buffer; *cp; ++cp) {
7148 x = charmapencode_output(*cp, mapping, res, respos);
7149 if (x==enc_EXCEPTION)
7150 return -1;
7151 else if (x==enc_FAILED) {
7152 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7153 return -1;
7154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007155 }
7156 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007157 *inpos = collendpos;
7158 break;
7159 default:
7160 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007161 encoding, reason, p, size, exceptionObject,
7162 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007163 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007164 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007165 if (PyBytes_Check(repunicode)) {
7166 /* Directly copy bytes result to output. */
7167 Py_ssize_t outsize = PyBytes_Size(*res);
7168 Py_ssize_t requiredsize;
7169 repsize = PyBytes_Size(repunicode);
7170 requiredsize = *respos + repsize;
7171 if (requiredsize > outsize)
7172 /* Make room for all additional bytes. */
7173 if (charmapencode_resize(res, respos, requiredsize)) {
7174 Py_DECREF(repunicode);
7175 return -1;
7176 }
7177 memcpy(PyBytes_AsString(*res) + *respos,
7178 PyBytes_AsString(repunicode), repsize);
7179 *respos += repsize;
7180 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007181 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007182 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007183 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007184 /* generate replacement */
7185 repsize = PyUnicode_GET_SIZE(repunicode);
7186 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007187 x = charmapencode_output(*uni2, mapping, res, respos);
7188 if (x==enc_EXCEPTION) {
7189 return -1;
7190 }
7191 else if (x==enc_FAILED) {
7192 Py_DECREF(repunicode);
7193 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7194 return -1;
7195 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007196 }
7197 *inpos = newpos;
7198 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199 }
7200 return 0;
7201}
7202
Alexander Belopolsky40018472011-02-26 01:02:56 +00007203PyObject *
7204PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7205 Py_ssize_t size,
7206 PyObject *mapping,
7207 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007209 /* output object */
7210 PyObject *res = NULL;
7211 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007212 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007213 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007214 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007215 PyObject *errorHandler = NULL;
7216 PyObject *exc = NULL;
7217 /* the following variable is used for caching string comparisons
7218 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7219 * 3=ignore, 4=xmlcharrefreplace */
7220 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221
7222 /* Default to Latin-1 */
7223 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007226 /* allocate enough for a simple encoding without
7227 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007228 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007229 if (res == NULL)
7230 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007231 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007234 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007235 /* try to encode it */
7236 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7237 if (x==enc_EXCEPTION) /* error */
7238 goto onError;
7239 if (x==enc_FAILED) { /* unencodable character */
7240 if (charmap_encoding_error(p, size, &inpos, mapping,
7241 &exc,
7242 &known_errorHandler, &errorHandler, errors,
7243 &res, &respos)) {
7244 goto onError;
7245 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007246 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 else
7248 /* done with this character => adjust input position */
7249 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007252 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007253 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007254 if (_PyBytes_Resize(&res, respos) < 0)
7255 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007256
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007257 Py_XDECREF(exc);
7258 Py_XDECREF(errorHandler);
7259 return res;
7260
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007262 Py_XDECREF(res);
7263 Py_XDECREF(exc);
7264 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265 return NULL;
7266}
7267
Alexander Belopolsky40018472011-02-26 01:02:56 +00007268PyObject *
7269PyUnicode_AsCharmapString(PyObject *unicode,
7270 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271{
7272 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 PyErr_BadArgument();
7274 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 }
7276 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 PyUnicode_GET_SIZE(unicode),
7278 mapping,
7279 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280}
7281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007282/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007283static void
7284make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007285 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007286 Py_ssize_t startpos, Py_ssize_t endpos,
7287 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007289 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007290 *exceptionObject = _PyUnicodeTranslateError_Create(
7291 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292 }
7293 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007294 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7295 goto onError;
7296 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7297 goto onError;
7298 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7299 goto onError;
7300 return;
7301 onError:
7302 Py_DECREF(*exceptionObject);
7303 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304 }
7305}
7306
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007307/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007308static void
7309raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007310 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007311 Py_ssize_t startpos, Py_ssize_t endpos,
7312 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007313{
7314 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007315 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007316 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007318}
7319
7320/* error handling callback helper:
7321 build arguments, call the callback and check the arguments,
7322 put the result into newpos and return the replacement string, which
7323 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007324static PyObject *
7325unicode_translate_call_errorhandler(const char *errors,
7326 PyObject **errorHandler,
7327 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007328 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007329 Py_ssize_t startpos, Py_ssize_t endpos,
7330 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007331{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007332 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007333
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007334 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007335 PyObject *restuple;
7336 PyObject *resunicode;
7337
7338 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007340 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007342 }
7343
7344 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007345 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007346 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007348
7349 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007351 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007353 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007354 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 Py_DECREF(restuple);
7356 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007357 }
7358 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 &resunicode, &i_newpos)) {
7360 Py_DECREF(restuple);
7361 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007362 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007363 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007364 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007365 else
7366 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007367 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7369 Py_DECREF(restuple);
7370 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007371 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007372 Py_INCREF(resunicode);
7373 Py_DECREF(restuple);
7374 return resunicode;
7375}
7376
7377/* Lookup the character ch in the mapping and put the result in result,
7378 which must be decrefed by the caller.
7379 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007380static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007381charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007382{
Christian Heimes217cfd12007-12-02 14:31:20 +00007383 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007384 PyObject *x;
7385
7386 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007388 x = PyObject_GetItem(mapping, w);
7389 Py_DECREF(w);
7390 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7392 /* No mapping found means: use 1:1 mapping. */
7393 PyErr_Clear();
7394 *result = NULL;
7395 return 0;
7396 } else
7397 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007398 }
7399 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 *result = x;
7401 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007402 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007403 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007404 long value = PyLong_AS_LONG(x);
7405 long max = PyUnicode_GetMax();
7406 if (value < 0 || value > max) {
7407 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007408 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 Py_DECREF(x);
7410 return -1;
7411 }
7412 *result = x;
7413 return 0;
7414 }
7415 else if (PyUnicode_Check(x)) {
7416 *result = x;
7417 return 0;
7418 }
7419 else {
7420 /* wrong return value */
7421 PyErr_SetString(PyExc_TypeError,
7422 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007423 Py_DECREF(x);
7424 return -1;
7425 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007426}
7427/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 if not reallocate and adjust various state variables.
7429 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007430static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007431charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007433{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007434 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007435 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 /* exponentially overallocate to minimize reallocations */
7437 if (requiredsize < 2 * oldsize)
7438 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007439 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7440 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007442 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007443 }
7444 return 0;
7445}
7446/* lookup the character, put the result in the output string and adjust
7447 various state variables. Return a new reference to the object that
7448 was put in the output buffer in *result, or Py_None, if the mapping was
7449 undefined (in which case no character was written).
7450 The called must decref result.
7451 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007452static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007453charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7454 PyObject *mapping, Py_UCS4 **output,
7455 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007456 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007458 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7459 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007461 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007463 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007464 }
7465 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007467 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007469 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007470 }
7471 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007472 Py_ssize_t repsize;
7473 if (PyUnicode_READY(*res) == -1)
7474 return -1;
7475 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 if (repsize==1) {
7477 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007478 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 }
7480 else if (repsize!=0) {
7481 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007482 Py_ssize_t requiredsize = *opos +
7483 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007485 Py_ssize_t i;
7486 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007488 for(i = 0; i < repsize; i++)
7489 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007491 }
7492 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007494 return 0;
7495}
7496
Alexander Belopolsky40018472011-02-26 01:02:56 +00007497PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007498_PyUnicode_TranslateCharmap(PyObject *input,
7499 PyObject *mapping,
7500 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007502 /* input object */
7503 char *idata;
7504 Py_ssize_t size, i;
7505 int kind;
7506 /* output buffer */
7507 Py_UCS4 *output = NULL;
7508 Py_ssize_t osize;
7509 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007510 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007511 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007512 char *reason = "character maps to <undefined>";
7513 PyObject *errorHandler = NULL;
7514 PyObject *exc = NULL;
7515 /* the following variable is used for caching string comparisons
7516 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7517 * 3=ignore, 4=xmlcharrefreplace */
7518 int known_errorHandler = -1;
7519
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 PyErr_BadArgument();
7522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007525 if (PyUnicode_READY(input) == -1)
7526 return NULL;
7527 idata = (char*)PyUnicode_DATA(input);
7528 kind = PyUnicode_KIND(input);
7529 size = PyUnicode_GET_LENGTH(input);
7530 i = 0;
7531
7532 if (size == 0) {
7533 Py_INCREF(input);
7534 return input;
7535 }
7536
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007537 /* allocate enough for a simple 1:1 translation without
7538 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007539 osize = size;
7540 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7541 opos = 0;
7542 if (output == NULL) {
7543 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007545 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007547 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007548 /* try to encode it */
7549 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007550 if (charmaptranslate_output(input, i, mapping,
7551 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 Py_XDECREF(x);
7553 goto onError;
7554 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007555 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007557 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 else { /* untranslatable character */
7559 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7560 Py_ssize_t repsize;
7561 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007562 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007563 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007564 Py_ssize_t collstart = i;
7565 Py_ssize_t collend = i+1;
7566 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007569 while (collend < size) {
7570 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007571 goto onError;
7572 Py_XDECREF(x);
7573 if (x!=Py_None)
7574 break;
7575 ++collend;
7576 }
7577 /* cache callback name lookup
7578 * (if not done yet, i.e. it's the first error) */
7579 if (known_errorHandler==-1) {
7580 if ((errors==NULL) || (!strcmp(errors, "strict")))
7581 known_errorHandler = 1;
7582 else if (!strcmp(errors, "replace"))
7583 known_errorHandler = 2;
7584 else if (!strcmp(errors, "ignore"))
7585 known_errorHandler = 3;
7586 else if (!strcmp(errors, "xmlcharrefreplace"))
7587 known_errorHandler = 4;
7588 else
7589 known_errorHandler = 0;
7590 }
7591 switch (known_errorHandler) {
7592 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007593 raise_translate_exception(&exc, input, collstart,
7594 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007595 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 case 2: /* replace */
7597 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007598 for (coll = collstart; coll<collend; coll++)
7599 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 /* fall through */
7601 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007602 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 break;
7604 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007605 /* generate replacement (temporarily (mis)uses i) */
7606 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 char buffer[2+29+1+1];
7608 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007609 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7610 if (charmaptranslate_makespace(&output, &osize,
7611 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 goto onError;
7613 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007614 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007616 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 break;
7618 default:
7619 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007620 reason, input, &exc,
7621 collstart, collend, &newpos);
7622 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 goto onError;
7624 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007625 repsize = PyUnicode_GET_LENGTH(repunicode);
7626 if (charmaptranslate_makespace(&output, &osize,
7627 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007628 Py_DECREF(repunicode);
7629 goto onError;
7630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007631 for (uni2 = 0; repsize-->0; ++uni2)
7632 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7633 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007635 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007636 }
7637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007638 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7639 if (!res)
7640 goto onError;
7641 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007642 Py_XDECREF(exc);
7643 Py_XDECREF(errorHandler);
7644 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007647 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007648 Py_XDECREF(exc);
7649 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650 return NULL;
7651}
7652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007653/* Deprecated. Use PyUnicode_Translate instead. */
7654PyObject *
7655PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7656 Py_ssize_t size,
7657 PyObject *mapping,
7658 const char *errors)
7659{
7660 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7661 if (!unicode)
7662 return NULL;
7663 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7664}
7665
Alexander Belopolsky40018472011-02-26 01:02:56 +00007666PyObject *
7667PyUnicode_Translate(PyObject *str,
7668 PyObject *mapping,
7669 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670{
7671 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007672
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673 str = PyUnicode_FromObject(str);
7674 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007676 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677 Py_DECREF(str);
7678 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007679
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681 Py_XDECREF(str);
7682 return NULL;
7683}
Tim Petersced69f82003-09-16 20:30:58 +00007684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007685static Py_UCS4
7686fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7687{
7688 /* No need to call PyUnicode_READY(self) because this function is only
7689 called as a callback from fixup() which does it already. */
7690 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7691 const int kind = PyUnicode_KIND(self);
7692 void *data = PyUnicode_DATA(self);
7693 Py_UCS4 maxchar = 0, ch, fixed;
7694 Py_ssize_t i;
7695
7696 for (i = 0; i < len; ++i) {
7697 ch = PyUnicode_READ(kind, data, i);
7698 fixed = 0;
7699 if (ch > 127) {
7700 if (Py_UNICODE_ISSPACE(ch))
7701 fixed = ' ';
7702 else {
7703 const int decimal = Py_UNICODE_TODECIMAL(ch);
7704 if (decimal >= 0)
7705 fixed = '0' + decimal;
7706 }
7707 if (fixed != 0) {
7708 if (fixed > maxchar)
7709 maxchar = fixed;
7710 PyUnicode_WRITE(kind, data, i, fixed);
7711 }
7712 else if (ch > maxchar)
7713 maxchar = ch;
7714 }
7715 else if (ch > maxchar)
7716 maxchar = ch;
7717 }
7718
7719 return maxchar;
7720}
7721
7722PyObject *
7723_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7724{
7725 if (!PyUnicode_Check(unicode)) {
7726 PyErr_BadInternalCall();
7727 return NULL;
7728 }
7729 if (PyUnicode_READY(unicode) == -1)
7730 return NULL;
7731 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7732 /* If the string is already ASCII, just return the same string */
7733 Py_INCREF(unicode);
7734 return unicode;
7735 }
7736 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7737}
7738
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007739PyObject *
7740PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7741 Py_ssize_t length)
7742{
7743 PyObject *result;
7744 Py_UNICODE *p; /* write pointer into result */
7745 Py_ssize_t i;
7746 /* Copy to a new string */
7747 result = (PyObject *)_PyUnicode_New(length);
7748 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7749 if (result == NULL)
7750 return result;
7751 p = PyUnicode_AS_UNICODE(result);
7752 /* Iterate over code points */
7753 for (i = 0; i < length; i++) {
7754 Py_UNICODE ch =s[i];
7755 if (ch > 127) {
7756 int decimal = Py_UNICODE_TODECIMAL(ch);
7757 if (decimal >= 0)
7758 p[i] = '0' + decimal;
7759 }
7760 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007761 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7762 Py_DECREF(result);
7763 return NULL;
7764 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007765 return result;
7766}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007767/* --- Decimal Encoder ---------------------------------------------------- */
7768
Alexander Belopolsky40018472011-02-26 01:02:56 +00007769int
7770PyUnicode_EncodeDecimal(Py_UNICODE *s,
7771 Py_ssize_t length,
7772 char *output,
7773 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007774{
7775 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007776 PyObject *errorHandler = NULL;
7777 PyObject *exc = NULL;
7778 const char *encoding = "decimal";
7779 const char *reason = "invalid decimal Unicode string";
7780 /* the following variable is used for caching string comparisons
7781 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7782 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007783
7784 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 PyErr_BadArgument();
7786 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007787 }
7788
7789 p = s;
7790 end = s + length;
7791 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 register Py_UNICODE ch = *p;
7793 int decimal;
7794 PyObject *repunicode;
7795 Py_ssize_t repsize;
7796 Py_ssize_t newpos;
7797 Py_UNICODE *uni2;
7798 Py_UNICODE *collstart;
7799 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007800
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007802 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 ++p;
7804 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007805 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 decimal = Py_UNICODE_TODECIMAL(ch);
7807 if (decimal >= 0) {
7808 *output++ = '0' + decimal;
7809 ++p;
7810 continue;
7811 }
7812 if (0 < ch && ch < 256) {
7813 *output++ = (char)ch;
7814 ++p;
7815 continue;
7816 }
7817 /* All other characters are considered unencodable */
7818 collstart = p;
7819 collend = p+1;
7820 while (collend < end) {
7821 if ((0 < *collend && *collend < 256) ||
7822 !Py_UNICODE_ISSPACE(*collend) ||
7823 Py_UNICODE_TODECIMAL(*collend))
7824 break;
7825 }
7826 /* cache callback name lookup
7827 * (if not done yet, i.e. it's the first error) */
7828 if (known_errorHandler==-1) {
7829 if ((errors==NULL) || (!strcmp(errors, "strict")))
7830 known_errorHandler = 1;
7831 else if (!strcmp(errors, "replace"))
7832 known_errorHandler = 2;
7833 else if (!strcmp(errors, "ignore"))
7834 known_errorHandler = 3;
7835 else if (!strcmp(errors, "xmlcharrefreplace"))
7836 known_errorHandler = 4;
7837 else
7838 known_errorHandler = 0;
7839 }
7840 switch (known_errorHandler) {
7841 case 1: /* strict */
7842 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7843 goto onError;
7844 case 2: /* replace */
7845 for (p = collstart; p < collend; ++p)
7846 *output++ = '?';
7847 /* fall through */
7848 case 3: /* ignore */
7849 p = collend;
7850 break;
7851 case 4: /* xmlcharrefreplace */
7852 /* generate replacement (temporarily (mis)uses p) */
7853 for (p = collstart; p < collend; ++p)
7854 output += sprintf(output, "&#%d;", (int)*p);
7855 p = collend;
7856 break;
7857 default:
7858 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7859 encoding, reason, s, length, &exc,
7860 collstart-s, collend-s, &newpos);
7861 if (repunicode == NULL)
7862 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007863 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007864 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007865 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7866 Py_DECREF(repunicode);
7867 goto onError;
7868 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 /* generate replacement */
7870 repsize = PyUnicode_GET_SIZE(repunicode);
7871 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7872 Py_UNICODE ch = *uni2;
7873 if (Py_UNICODE_ISSPACE(ch))
7874 *output++ = ' ';
7875 else {
7876 decimal = Py_UNICODE_TODECIMAL(ch);
7877 if (decimal >= 0)
7878 *output++ = '0' + decimal;
7879 else if (0 < ch && ch < 256)
7880 *output++ = (char)ch;
7881 else {
7882 Py_DECREF(repunicode);
7883 raise_encode_exception(&exc, encoding,
7884 s, length, collstart-s, collend-s, reason);
7885 goto onError;
7886 }
7887 }
7888 }
7889 p = s + newpos;
7890 Py_DECREF(repunicode);
7891 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007892 }
7893 /* 0-terminate the output string */
7894 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007895 Py_XDECREF(exc);
7896 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007897 return 0;
7898
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007900 Py_XDECREF(exc);
7901 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007902 return -1;
7903}
7904
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905/* --- Helpers ------------------------------------------------------------ */
7906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007907#include "stringlib/ucs1lib.h"
7908#include "stringlib/fastsearch.h"
7909#include "stringlib/partition.h"
7910#include "stringlib/split.h"
7911#include "stringlib/count.h"
7912#include "stringlib/find.h"
7913#include "stringlib/localeutil.h"
7914#include "stringlib/undef.h"
7915
7916#include "stringlib/ucs2lib.h"
7917#include "stringlib/fastsearch.h"
7918#include "stringlib/partition.h"
7919#include "stringlib/split.h"
7920#include "stringlib/count.h"
7921#include "stringlib/find.h"
7922#include "stringlib/localeutil.h"
7923#include "stringlib/undef.h"
7924
7925#include "stringlib/ucs4lib.h"
7926#include "stringlib/fastsearch.h"
7927#include "stringlib/partition.h"
7928#include "stringlib/split.h"
7929#include "stringlib/count.h"
7930#include "stringlib/find.h"
7931#include "stringlib/localeutil.h"
7932#include "stringlib/undef.h"
7933
7934static Py_ssize_t
7935any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7936 const Py_UCS1*, Py_ssize_t,
7937 Py_ssize_t, Py_ssize_t),
7938 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7939 const Py_UCS2*, Py_ssize_t,
7940 Py_ssize_t, Py_ssize_t),
7941 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7942 const Py_UCS4*, Py_ssize_t,
7943 Py_ssize_t, Py_ssize_t),
7944 PyObject* s1, PyObject* s2,
7945 Py_ssize_t start,
7946 Py_ssize_t end)
7947{
7948 int kind1, kind2, kind;
7949 void *buf1, *buf2;
7950 Py_ssize_t len1, len2, result;
7951
7952 kind1 = PyUnicode_KIND(s1);
7953 kind2 = PyUnicode_KIND(s2);
7954 kind = kind1 > kind2 ? kind1 : kind2;
7955 buf1 = PyUnicode_DATA(s1);
7956 buf2 = PyUnicode_DATA(s2);
7957 if (kind1 != kind)
7958 buf1 = _PyUnicode_AsKind(s1, kind);
7959 if (!buf1)
7960 return -2;
7961 if (kind2 != kind)
7962 buf2 = _PyUnicode_AsKind(s2, kind);
7963 if (!buf2) {
7964 if (kind1 != kind) PyMem_Free(buf1);
7965 return -2;
7966 }
7967 len1 = PyUnicode_GET_LENGTH(s1);
7968 len2 = PyUnicode_GET_LENGTH(s2);
7969
7970 switch(kind) {
7971 case PyUnicode_1BYTE_KIND:
7972 result = ucs1(buf1, len1, buf2, len2, start, end);
7973 break;
7974 case PyUnicode_2BYTE_KIND:
7975 result = ucs2(buf1, len1, buf2, len2, start, end);
7976 break;
7977 case PyUnicode_4BYTE_KIND:
7978 result = ucs4(buf1, len1, buf2, len2, start, end);
7979 break;
7980 default:
7981 assert(0); result = -2;
7982 }
7983
7984 if (kind1 != kind)
7985 PyMem_Free(buf1);
7986 if (kind2 != kind)
7987 PyMem_Free(buf2);
7988
7989 return result;
7990}
7991
7992Py_ssize_t
7993_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7994 Py_ssize_t n_buffer,
7995 void *digits, Py_ssize_t n_digits,
7996 Py_ssize_t min_width,
7997 const char *grouping,
7998 const char *thousands_sep)
7999{
8000 switch(kind) {
8001 case PyUnicode_1BYTE_KIND:
8002 return _PyUnicode_ucs1_InsertThousandsGrouping(
8003 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8004 min_width, grouping, thousands_sep);
8005 case PyUnicode_2BYTE_KIND:
8006 return _PyUnicode_ucs2_InsertThousandsGrouping(
8007 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8008 min_width, grouping, thousands_sep);
8009 case PyUnicode_4BYTE_KIND:
8010 return _PyUnicode_ucs4_InsertThousandsGrouping(
8011 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8012 min_width, grouping, thousands_sep);
8013 }
8014 assert(0);
8015 return -1;
8016}
8017
8018
Eric Smith8c663262007-08-25 02:26:07 +00008019#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008020#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008021
Thomas Wouters477c8d52006-05-27 19:21:47 +00008022#include "stringlib/count.h"
8023#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008024
/* helper macro to fixup start/end slice values:
   end is clipped to len, negative values of start and end are taken
   relative to len, and whatever is still negative is clamped to 0.
   start and end must be modifiable lvalues; every argument is evaluated
   more than once.  The body is wrapped in do { } while (0) so that the
   macro behaves as a single statement (e.g. under an unbraced if/else)
   and must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008039
Alexander Belopolsky40018472011-02-26 01:02:56 +00008040Py_ssize_t
8041PyUnicode_Count(PyObject *str,
8042 PyObject *substr,
8043 Py_ssize_t start,
8044 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008046 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008047 PyUnicodeObject* str_obj;
8048 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008049 int kind1, kind2, kind;
8050 void *buf1 = NULL, *buf2 = NULL;
8051 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008052
Thomas Wouters477c8d52006-05-27 19:21:47 +00008053 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008054 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008056 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008057 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 Py_DECREF(str_obj);
8059 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060 }
Tim Petersced69f82003-09-16 20:30:58 +00008061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008062 kind1 = PyUnicode_KIND(str_obj);
8063 kind2 = PyUnicode_KIND(sub_obj);
8064 kind = kind1 > kind2 ? kind1 : kind2;
8065 buf1 = PyUnicode_DATA(str_obj);
8066 if (kind1 != kind)
8067 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8068 if (!buf1)
8069 goto onError;
8070 buf2 = PyUnicode_DATA(sub_obj);
8071 if (kind2 != kind)
8072 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8073 if (!buf2)
8074 goto onError;
8075 len1 = PyUnicode_GET_LENGTH(str_obj);
8076 len2 = PyUnicode_GET_LENGTH(sub_obj);
8077
8078 ADJUST_INDICES(start, end, len1);
8079 switch(kind) {
8080 case PyUnicode_1BYTE_KIND:
8081 result = ucs1lib_count(
8082 ((Py_UCS1*)buf1) + start, end - start,
8083 buf2, len2, PY_SSIZE_T_MAX
8084 );
8085 break;
8086 case PyUnicode_2BYTE_KIND:
8087 result = ucs2lib_count(
8088 ((Py_UCS2*)buf1) + start, end - start,
8089 buf2, len2, PY_SSIZE_T_MAX
8090 );
8091 break;
8092 case PyUnicode_4BYTE_KIND:
8093 result = ucs4lib_count(
8094 ((Py_UCS4*)buf1) + start, end - start,
8095 buf2, len2, PY_SSIZE_T_MAX
8096 );
8097 break;
8098 default:
8099 assert(0); result = 0;
8100 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008101
8102 Py_DECREF(sub_obj);
8103 Py_DECREF(str_obj);
8104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008105 if (kind1 != kind)
8106 PyMem_Free(buf1);
8107 if (kind2 != kind)
8108 PyMem_Free(buf2);
8109
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008111 onError:
8112 Py_DECREF(sub_obj);
8113 Py_DECREF(str_obj);
8114 if (kind1 != kind && buf1)
8115 PyMem_Free(buf1);
8116 if (kind2 != kind && buf2)
8117 PyMem_Free(buf2);
8118 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119}
8120
Alexander Belopolsky40018472011-02-26 01:02:56 +00008121Py_ssize_t
8122PyUnicode_Find(PyObject *str,
8123 PyObject *sub,
8124 Py_ssize_t start,
8125 Py_ssize_t end,
8126 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008128 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008129
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008133 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008134 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 Py_DECREF(str);
8136 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 }
Tim Petersced69f82003-09-16 20:30:58 +00008138
Thomas Wouters477c8d52006-05-27 19:21:47 +00008139 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008140 result = any_find_slice(
8141 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8142 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008143 );
8144 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008145 result = any_find_slice(
8146 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8147 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008148 );
8149
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008151 Py_DECREF(sub);
8152
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 return result;
8154}
8155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156Py_ssize_t
8157PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8158 Py_ssize_t start, Py_ssize_t end,
8159 int direction)
8160{
8161 char *result;
8162 int kind;
8163 if (PyUnicode_READY(str) == -1)
8164 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008165 if (start < 0 || end < 0) {
8166 PyErr_SetString(PyExc_IndexError, "string index out of range");
8167 return -2;
8168 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008169 if (end > PyUnicode_GET_LENGTH(str))
8170 end = PyUnicode_GET_LENGTH(str);
8171 kind = PyUnicode_KIND(str);
8172 result = findchar(PyUnicode_1BYTE_DATA(str)
8173 + PyUnicode_KIND_SIZE(kind, start),
8174 kind,
8175 end-start, ch, direction);
8176 if (!result)
8177 return -1;
8178 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8179}
8180
Alexander Belopolsky40018472011-02-26 01:02:56 +00008181static int
8182tailmatch(PyUnicodeObject *self,
8183 PyUnicodeObject *substring,
8184 Py_ssize_t start,
8185 Py_ssize_t end,
8186 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008188 int kind_self;
8189 int kind_sub;
8190 void *data_self;
8191 void *data_sub;
8192 Py_ssize_t offset;
8193 Py_ssize_t i;
8194 Py_ssize_t end_sub;
8195
8196 if (PyUnicode_READY(self) == -1 ||
8197 PyUnicode_READY(substring) == -1)
8198 return 0;
8199
8200 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 return 1;
8202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008203 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8204 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008208 kind_self = PyUnicode_KIND(self);
8209 data_self = PyUnicode_DATA(self);
8210 kind_sub = PyUnicode_KIND(substring);
8211 data_sub = PyUnicode_DATA(substring);
8212 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8213
8214 if (direction > 0)
8215 offset = end;
8216 else
8217 offset = start;
8218
8219 if (PyUnicode_READ(kind_self, data_self, offset) ==
8220 PyUnicode_READ(kind_sub, data_sub, 0) &&
8221 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8222 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8223 /* If both are of the same kind, memcmp is sufficient */
8224 if (kind_self == kind_sub) {
8225 return ! memcmp((char *)data_self +
8226 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8227 data_sub,
8228 PyUnicode_GET_LENGTH(substring) *
8229 PyUnicode_CHARACTER_SIZE(substring));
8230 }
8231 /* otherwise we have to compare each character by first accesing it */
8232 else {
8233 /* We do not need to compare 0 and len(substring)-1 because
8234 the if statement above ensured already that they are equal
8235 when we end up here. */
8236 // TODO: honor direction and do a forward or backwards search
8237 for (i = 1; i < end_sub; ++i) {
8238 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8239 PyUnicode_READ(kind_sub, data_sub, i))
8240 return 0;
8241 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244 }
8245
8246 return 0;
8247}
8248
Alexander Belopolsky40018472011-02-26 01:02:56 +00008249Py_ssize_t
8250PyUnicode_Tailmatch(PyObject *str,
8251 PyObject *substr,
8252 Py_ssize_t start,
8253 Py_ssize_t end,
8254 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008256 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008257
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258 str = PyUnicode_FromObject(str);
8259 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 substr = PyUnicode_FromObject(substr);
8262 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 Py_DECREF(str);
8264 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265 }
Tim Petersced69f82003-09-16 20:30:58 +00008266
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 (PyUnicodeObject *)substr,
8269 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270 Py_DECREF(str);
8271 Py_DECREF(substr);
8272 return result;
8273}
8274
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275/* Apply fixfct filter to the Unicode object self and return a
8276 reference to the modified object */
8277
Alexander Belopolsky40018472011-02-26 01:02:56 +00008278static PyObject *
8279fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008280 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008282 PyObject *u;
8283 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008285 if (PyUnicode_READY(self) == -1)
8286 return NULL;
8287 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8288 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8289 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008293 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8294 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008296 /* fix functions return the new maximum character in a string,
8297 if the kind of the resulting unicode object does not change,
8298 everything is fine. Otherwise we need to change the string kind
8299 and re-run the fix function. */
8300 maxchar_new = fixfct((PyUnicodeObject*)u);
8301 if (maxchar_new == 0)
8302 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8303 else if (maxchar_new <= 127)
8304 maxchar_new = 127;
8305 else if (maxchar_new <= 255)
8306 maxchar_new = 255;
8307 else if (maxchar_new <= 65535)
8308 maxchar_new = 65535;
8309 else
8310 maxchar_new = 1114111; /* 0x10ffff */
8311
8312 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 /* fixfct should return TRUE if it modified the buffer. If
8314 FALSE, return a reference to the original buffer instead
8315 (to save space, not time) */
8316 Py_INCREF(self);
8317 Py_DECREF(u);
8318 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008320 else if (maxchar_new == maxchar_old) {
8321 return u;
8322 }
8323 else {
8324 /* In case the maximum character changed, we need to
8325 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008326 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 if (v == NULL) {
8328 Py_DECREF(u);
8329 return NULL;
8330 }
8331 if (maxchar_new > maxchar_old) {
8332 /* If the maxchar increased so that the kind changed, not all
8333 characters are representable anymore and we need to fix the
8334 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008335 if (PyUnicode_CopyCharacters(v, 0,
8336 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008337 PyUnicode_GET_LENGTH(self)) < 0)
8338 {
8339 Py_DECREF(u);
8340 return NULL;
8341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342 maxchar_old = fixfct((PyUnicodeObject*)v);
8343 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8344 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008345 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008346 if (PyUnicode_CopyCharacters(v, 0,
8347 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008348 PyUnicode_GET_LENGTH(self)) < 0)
8349 {
8350 Py_DECREF(u);
8351 return NULL;
8352 }
8353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008354
8355 Py_DECREF(u);
8356 return v;
8357 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358}
8359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008361fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008363 /* No need to call PyUnicode_READY(self) because this function is only
8364 called as a callback from fixup() which does it already. */
8365 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8366 const int kind = PyUnicode_KIND(self);
8367 void *data = PyUnicode_DATA(self);
8368 int touched = 0;
8369 Py_UCS4 maxchar = 0;
8370 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 for (i = 0; i < len; ++i) {
8373 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8374 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8375 if (up != ch) {
8376 if (up > maxchar)
8377 maxchar = up;
8378 PyUnicode_WRITE(kind, data, i, up);
8379 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 else if (ch > maxchar)
8382 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 }
8384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 if (touched)
8386 return maxchar;
8387 else
8388 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389}
8390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008392fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8395 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8396 const int kind = PyUnicode_KIND(self);
8397 void *data = PyUnicode_DATA(self);
8398 int touched = 0;
8399 Py_UCS4 maxchar = 0;
8400 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 for(i = 0; i < len; ++i) {
8403 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8404 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8405 if (lo != ch) {
8406 if (lo > maxchar)
8407 maxchar = lo;
8408 PyUnicode_WRITE(kind, data, i, lo);
8409 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411 else if (ch > maxchar)
8412 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413 }
8414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 if (touched)
8416 return maxchar;
8417 else
8418 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419}
8420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008421static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008422fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8425 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8426 const int kind = PyUnicode_KIND(self);
8427 void *data = PyUnicode_DATA(self);
8428 int touched = 0;
8429 Py_UCS4 maxchar = 0;
8430 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 for(i = 0; i < len; ++i) {
8433 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8434 Py_UCS4 nu = 0;
8435
8436 if (Py_UNICODE_ISUPPER(ch))
8437 nu = Py_UNICODE_TOLOWER(ch);
8438 else if (Py_UNICODE_ISLOWER(ch))
8439 nu = Py_UNICODE_TOUPPER(ch);
8440
8441 if (nu != 0) {
8442 if (nu > maxchar)
8443 maxchar = nu;
8444 PyUnicode_WRITE(kind, data, i, nu);
8445 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 else if (ch > maxchar)
8448 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 }
8450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451 if (touched)
8452 return maxchar;
8453 else
8454 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455}
8456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008458fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8461 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8462 const int kind = PyUnicode_KIND(self);
8463 void *data = PyUnicode_DATA(self);
8464 int touched = 0;
8465 Py_UCS4 maxchar = 0;
8466 Py_ssize_t i = 0;
8467 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008468
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008469 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471
8472 ch = PyUnicode_READ(kind, data, i);
8473 if (!Py_UNICODE_ISUPPER(ch)) {
8474 maxchar = Py_UNICODE_TOUPPER(ch);
8475 PyUnicode_WRITE(kind, data, i, maxchar);
8476 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 ++i;
8479 for(; i < len; ++i) {
8480 ch = PyUnicode_READ(kind, data, i);
8481 if (!Py_UNICODE_ISLOWER(ch)) {
8482 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8483 if (lo > maxchar)
8484 maxchar = lo;
8485 PyUnicode_WRITE(kind, data, i, lo);
8486 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008488 else if (ch > maxchar)
8489 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008490 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491
8492 if (touched)
8493 return maxchar;
8494 else
8495 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496}
8497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008499fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8502 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8503 const int kind = PyUnicode_KIND(self);
8504 void *data = PyUnicode_DATA(self);
8505 Py_UCS4 maxchar = 0;
8506 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 int previous_is_cased;
8508
8509 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510 if (len == 1) {
8511 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8512 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8513 if (ti != ch) {
8514 PyUnicode_WRITE(kind, data, i, ti);
8515 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 }
8517 else
8518 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 for(; i < len; ++i) {
8522 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8523 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008524
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 nu = Py_UNICODE_TOTITLE(ch);
8529
8530 if (nu > maxchar)
8531 maxchar = nu;
8532 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008533
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 if (Py_UNICODE_ISLOWER(ch) ||
8535 Py_UNICODE_ISUPPER(ch) ||
8536 Py_UNICODE_ISTITLE(ch))
8537 previous_is_cased = 1;
8538 else
8539 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542}
8543
Tim Peters8ce9f162004-08-27 01:49:32 +00008544PyObject *
8545PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008548 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008550 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008551 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8552 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008553 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 Py_ssize_t sz, i, res_offset;
8555 Py_UCS4 maxchar = 0;
8556 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557
Tim Peters05eba1f2004-08-27 21:32:02 +00008558 fseq = PySequence_Fast(seq, "");
8559 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008560 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008561 }
8562
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008563 /* NOTE: the following code can't call back into Python code,
8564 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008565 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008566
Tim Peters05eba1f2004-08-27 21:32:02 +00008567 seqlen = PySequence_Fast_GET_SIZE(fseq);
8568 /* If empty sequence, return u"". */
8569 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008571 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008572 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008573 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008574 /* If singleton sequence with an exact Unicode, return that. */
8575 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 item = items[0];
8577 if (PyUnicode_CheckExact(item)) {
8578 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 goto Done;
8581 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008582 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008583 else {
8584 /* Set up sep and seplen */
8585 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586 /* fall back to a blank space separator */
8587 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008588 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008590 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008591 else {
8592 if (!PyUnicode_Check(separator)) {
8593 PyErr_Format(PyExc_TypeError,
8594 "separator: expected str instance,"
8595 " %.80s found",
8596 Py_TYPE(separator)->tp_name);
8597 goto onError;
8598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599 if (PyUnicode_READY(separator) == -1)
8600 goto onError;
8601 sep = separator;
8602 seplen = PyUnicode_GET_LENGTH(separator);
8603 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8604 /* inc refcount to keep this code path symetric with the
8605 above case of a blank separator */
8606 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008607 }
8608 }
8609
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008610 /* There are at least two things to join, or else we have a subclass
8611 * of str in the sequence.
8612 * Do a pre-pass to figure out the total amount of space we'll
8613 * need (sz), and see whether all argument are strings.
8614 */
8615 sz = 0;
8616 for (i = 0; i < seqlen; i++) {
8617 const Py_ssize_t old_sz = sz;
8618 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 if (!PyUnicode_Check(item)) {
8620 PyErr_Format(PyExc_TypeError,
8621 "sequence item %zd: expected str instance,"
8622 " %.80s found",
8623 i, Py_TYPE(item)->tp_name);
8624 goto onError;
8625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 if (PyUnicode_READY(item) == -1)
8627 goto onError;
8628 sz += PyUnicode_GET_LENGTH(item);
8629 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8630 if (item_maxchar > maxchar)
8631 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008632 if (i != 0)
8633 sz += seplen;
8634 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8635 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008637 goto onError;
8638 }
8639 }
Tim Petersced69f82003-09-16 20:30:58 +00008640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008642 if (res == NULL)
8643 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008644
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008645 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008647 Py_ssize_t itemlen;
8648 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 /* Copy item, and maybe the separator. */
8651 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008652 if (PyUnicode_CopyCharacters(res, res_offset,
8653 sep, 0, seplen) < 0)
8654 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008657 if (PyUnicode_CopyCharacters(res, res_offset,
8658 item, 0, itemlen) < 0)
8659 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008663
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008665 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 Py_XDECREF(sep);
8667 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008670 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008672 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 return NULL;
8674}
8675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676#define FILL(kind, data, value, start, length) \
8677 do { \
8678 Py_ssize_t i_ = 0; \
8679 assert(kind != PyUnicode_WCHAR_KIND); \
8680 switch ((kind)) { \
8681 case PyUnicode_1BYTE_KIND: { \
8682 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8683 memset(to_, (unsigned char)value, length); \
8684 break; \
8685 } \
8686 case PyUnicode_2BYTE_KIND: { \
8687 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8688 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8689 break; \
8690 } \
8691 default: { \
8692 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8693 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8694 break; \
8695 } \
8696 } \
8697 } while (0)
8698
Alexander Belopolsky40018472011-02-26 01:02:56 +00008699static PyUnicodeObject *
8700pad(PyUnicodeObject *self,
8701 Py_ssize_t left,
8702 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008705 PyObject *u;
8706 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008707 int kind;
8708 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709
8710 if (left < 0)
8711 left = 0;
8712 if (right < 0)
8713 right = 0;
8714
Tim Peters7a29bd52001-09-12 03:03:31 +00008715 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 Py_INCREF(self);
8717 return self;
8718 }
8719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8721 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008722 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8723 return NULL;
8724 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8726 if (fill > maxchar)
8727 maxchar = fill;
8728 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008729 if (!u)
8730 return NULL;
8731
8732 kind = PyUnicode_KIND(u);
8733 data = PyUnicode_DATA(u);
8734 if (left)
8735 FILL(kind, data, fill, 0, left);
8736 if (right)
8737 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008738 if (PyUnicode_CopyCharacters(u, left,
8739 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008740 _PyUnicode_LENGTH(self)) < 0)
8741 {
8742 Py_DECREF(u);
8743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744 }
8745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749
Alexander Belopolsky40018472011-02-26 01:02:56 +00008750PyObject *
8751PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754
8755 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 switch(PyUnicode_KIND(string)) {
8760 case PyUnicode_1BYTE_KIND:
8761 list = ucs1lib_splitlines(
8762 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8763 PyUnicode_GET_LENGTH(string), keepends);
8764 break;
8765 case PyUnicode_2BYTE_KIND:
8766 list = ucs2lib_splitlines(
8767 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8768 PyUnicode_GET_LENGTH(string), keepends);
8769 break;
8770 case PyUnicode_4BYTE_KIND:
8771 list = ucs4lib_splitlines(
8772 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8773 PyUnicode_GET_LENGTH(string), keepends);
8774 break;
8775 default:
8776 assert(0);
8777 list = 0;
8778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779 Py_DECREF(string);
8780 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781}
8782
Alexander Belopolsky40018472011-02-26 01:02:56 +00008783static PyObject *
8784split(PyUnicodeObject *self,
8785 PyUnicodeObject *substring,
8786 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788 int kind1, kind2, kind;
8789 void *buf1, *buf2;
8790 Py_ssize_t len1, len2;
8791 PyObject* out;
8792
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008794 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 if (PyUnicode_READY(self) == -1)
8797 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008799 if (substring == NULL)
8800 switch(PyUnicode_KIND(self)) {
8801 case PyUnicode_1BYTE_KIND:
8802 return ucs1lib_split_whitespace(
8803 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8804 PyUnicode_GET_LENGTH(self), maxcount
8805 );
8806 case PyUnicode_2BYTE_KIND:
8807 return ucs2lib_split_whitespace(
8808 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8809 PyUnicode_GET_LENGTH(self), maxcount
8810 );
8811 case PyUnicode_4BYTE_KIND:
8812 return ucs4lib_split_whitespace(
8813 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8814 PyUnicode_GET_LENGTH(self), maxcount
8815 );
8816 default:
8817 assert(0);
8818 return NULL;
8819 }
8820
8821 if (PyUnicode_READY(substring) == -1)
8822 return NULL;
8823
8824 kind1 = PyUnicode_KIND(self);
8825 kind2 = PyUnicode_KIND(substring);
8826 kind = kind1 > kind2 ? kind1 : kind2;
8827 buf1 = PyUnicode_DATA(self);
8828 buf2 = PyUnicode_DATA(substring);
8829 if (kind1 != kind)
8830 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8831 if (!buf1)
8832 return NULL;
8833 if (kind2 != kind)
8834 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8835 if (!buf2) {
8836 if (kind1 != kind) PyMem_Free(buf1);
8837 return NULL;
8838 }
8839 len1 = PyUnicode_GET_LENGTH(self);
8840 len2 = PyUnicode_GET_LENGTH(substring);
8841
8842 switch(kind) {
8843 case PyUnicode_1BYTE_KIND:
8844 out = ucs1lib_split(
8845 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8846 break;
8847 case PyUnicode_2BYTE_KIND:
8848 out = ucs2lib_split(
8849 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8850 break;
8851 case PyUnicode_4BYTE_KIND:
8852 out = ucs4lib_split(
8853 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8854 break;
8855 default:
8856 out = NULL;
8857 }
8858 if (kind1 != kind)
8859 PyMem_Free(buf1);
8860 if (kind2 != kind)
8861 PyMem_Free(buf2);
8862 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863}
8864
Alexander Belopolsky40018472011-02-26 01:02:56 +00008865static PyObject *
8866rsplit(PyUnicodeObject *self,
8867 PyUnicodeObject *substring,
8868 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008869{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008870 int kind1, kind2, kind;
8871 void *buf1, *buf2;
8872 Py_ssize_t len1, len2;
8873 PyObject* out;
8874
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008875 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008876 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 if (PyUnicode_READY(self) == -1)
8879 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 if (substring == NULL)
8882 switch(PyUnicode_KIND(self)) {
8883 case PyUnicode_1BYTE_KIND:
8884 return ucs1lib_rsplit_whitespace(
8885 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8886 PyUnicode_GET_LENGTH(self), maxcount
8887 );
8888 case PyUnicode_2BYTE_KIND:
8889 return ucs2lib_rsplit_whitespace(
8890 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8891 PyUnicode_GET_LENGTH(self), maxcount
8892 );
8893 case PyUnicode_4BYTE_KIND:
8894 return ucs4lib_rsplit_whitespace(
8895 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8896 PyUnicode_GET_LENGTH(self), maxcount
8897 );
8898 default:
8899 assert(0);
8900 return NULL;
8901 }
8902
8903 if (PyUnicode_READY(substring) == -1)
8904 return NULL;
8905
8906 kind1 = PyUnicode_KIND(self);
8907 kind2 = PyUnicode_KIND(substring);
8908 kind = kind1 > kind2 ? kind1 : kind2;
8909 buf1 = PyUnicode_DATA(self);
8910 buf2 = PyUnicode_DATA(substring);
8911 if (kind1 != kind)
8912 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8913 if (!buf1)
8914 return NULL;
8915 if (kind2 != kind)
8916 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8917 if (!buf2) {
8918 if (kind1 != kind) PyMem_Free(buf1);
8919 return NULL;
8920 }
8921 len1 = PyUnicode_GET_LENGTH(self);
8922 len2 = PyUnicode_GET_LENGTH(substring);
8923
8924 switch(kind) {
8925 case PyUnicode_1BYTE_KIND:
8926 out = ucs1lib_rsplit(
8927 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8928 break;
8929 case PyUnicode_2BYTE_KIND:
8930 out = ucs2lib_rsplit(
8931 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8932 break;
8933 case PyUnicode_4BYTE_KIND:
8934 out = ucs4lib_rsplit(
8935 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8936 break;
8937 default:
8938 out = NULL;
8939 }
8940 if (kind1 != kind)
8941 PyMem_Free(buf1);
8942 if (kind2 != kind)
8943 PyMem_Free(buf2);
8944 return out;
8945}
8946
8947static Py_ssize_t
8948anylib_find(int kind, void *buf1, Py_ssize_t len1,
8949 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8950{
8951 switch(kind) {
8952 case PyUnicode_1BYTE_KIND:
8953 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8954 case PyUnicode_2BYTE_KIND:
8955 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8956 case PyUnicode_4BYTE_KIND:
8957 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8958 }
8959 assert(0);
8960 return -1;
8961}
8962
8963static Py_ssize_t
8964anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8965 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8966{
8967 switch(kind) {
8968 case PyUnicode_1BYTE_KIND:
8969 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8970 case PyUnicode_2BYTE_KIND:
8971 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8972 case PyUnicode_4BYTE_KIND:
8973 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8974 }
8975 assert(0);
8976 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008977}
8978
Alexander Belopolsky40018472011-02-26 01:02:56 +00008979static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980replace(PyObject *self, PyObject *str1,
8981 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983 PyObject *u;
8984 char *sbuf = PyUnicode_DATA(self);
8985 char *buf1 = PyUnicode_DATA(str1);
8986 char *buf2 = PyUnicode_DATA(str2);
8987 int srelease = 0, release1 = 0, release2 = 0;
8988 int skind = PyUnicode_KIND(self);
8989 int kind1 = PyUnicode_KIND(str1);
8990 int kind2 = PyUnicode_KIND(str2);
8991 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8992 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8993 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994
8995 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008996 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008998 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 if (skind < kind1)
9001 /* substring too wide to be present */
9002 goto nothing;
9003
9004 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009005 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009006 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009008 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009010 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 Py_UCS4 u1, u2, maxchar;
9012 int mayshrink, rkind;
9013 u1 = PyUnicode_READ_CHAR(str1, 0);
9014 if (!findchar(sbuf, PyUnicode_KIND(self),
9015 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009016 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 u2 = PyUnicode_READ_CHAR(str2, 0);
9018 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9019 /* Replacing u1 with u2 may cause a maxchar reduction in the
9020 result string. */
9021 mayshrink = maxchar > 127;
9022 if (u2 > maxchar) {
9023 maxchar = u2;
9024 mayshrink = 0;
9025 }
9026 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009027 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009029 if (PyUnicode_CopyCharacters(u, 0,
9030 (PyObject*)self, 0, slen) < 0)
9031 {
9032 Py_DECREF(u);
9033 return NULL;
9034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035 rkind = PyUnicode_KIND(u);
9036 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9037 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009038 if (--maxcount < 0)
9039 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 if (mayshrink) {
9043 PyObject *tmp = u;
9044 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9045 PyUnicode_GET_LENGTH(tmp));
9046 Py_DECREF(tmp);
9047 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 int rkind = skind;
9050 char *res;
9051 if (kind1 < rkind) {
9052 /* widen substring */
9053 buf1 = _PyUnicode_AsKind(str1, rkind);
9054 if (!buf1) goto error;
9055 release1 = 1;
9056 }
9057 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009058 if (i < 0)
9059 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 if (rkind > kind2) {
9061 /* widen replacement */
9062 buf2 = _PyUnicode_AsKind(str2, rkind);
9063 if (!buf2) goto error;
9064 release2 = 1;
9065 }
9066 else if (rkind < kind2) {
9067 /* widen self and buf1 */
9068 rkind = kind2;
9069 if (release1) PyMem_Free(buf1);
9070 sbuf = _PyUnicode_AsKind(self, rkind);
9071 if (!sbuf) goto error;
9072 srelease = 1;
9073 buf1 = _PyUnicode_AsKind(str1, rkind);
9074 if (!buf1) goto error;
9075 release1 = 1;
9076 }
9077 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9078 if (!res) {
9079 PyErr_NoMemory();
9080 goto error;
9081 }
9082 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009083 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9085 buf2,
9086 PyUnicode_KIND_SIZE(rkind, len2));
9087 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009088
9089 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9091 slen-i,
9092 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009093 if (i == -1)
9094 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9096 buf2,
9097 PyUnicode_KIND_SIZE(rkind, len2));
9098 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009099 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100
9101 u = PyUnicode_FromKindAndData(rkind, res, slen);
9102 PyMem_Free(res);
9103 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009105 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107 Py_ssize_t n, i, j, ires;
9108 Py_ssize_t product, new_size;
9109 int rkind = skind;
9110 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 if (kind1 < rkind) {
9113 buf1 = _PyUnicode_AsKind(str1, rkind);
9114 if (!buf1) goto error;
9115 release1 = 1;
9116 }
9117 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009118 if (n == 0)
9119 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 if (kind2 < rkind) {
9121 buf2 = _PyUnicode_AsKind(str2, rkind);
9122 if (!buf2) goto error;
9123 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125 else if (kind2 > rkind) {
9126 rkind = kind2;
9127 sbuf = _PyUnicode_AsKind(self, rkind);
9128 if (!sbuf) goto error;
9129 srelease = 1;
9130 if (release1) PyMem_Free(buf1);
9131 buf1 = _PyUnicode_AsKind(str1, rkind);
9132 if (!buf1) goto error;
9133 release1 = 1;
9134 }
9135 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9136 PyUnicode_GET_LENGTH(str1))); */
9137 product = n * (len2-len1);
9138 if ((product / (len2-len1)) != n) {
9139 PyErr_SetString(PyExc_OverflowError,
9140 "replace string is too long");
9141 goto error;
9142 }
9143 new_size = slen + product;
9144 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9145 PyErr_SetString(PyExc_OverflowError,
9146 "replace string is too long");
9147 goto error;
9148 }
9149 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9150 if (!res)
9151 goto error;
9152 ires = i = 0;
9153 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009154 while (n-- > 0) {
9155 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 j = anylib_find(rkind,
9157 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9158 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009159 if (j == -1)
9160 break;
9161 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009162 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9164 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9165 PyUnicode_KIND_SIZE(rkind, j-i));
9166 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009167 }
9168 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009169 if (len2 > 0) {
9170 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9171 buf2,
9172 PyUnicode_KIND_SIZE(rkind, len2));
9173 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009174 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009177 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009178 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9180 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9181 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009182 } else {
9183 /* interleave */
9184 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9186 buf2,
9187 PyUnicode_KIND_SIZE(rkind, len2));
9188 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009189 if (--n <= 0)
9190 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9192 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9193 PyUnicode_KIND_SIZE(rkind, 1));
9194 ires++;
9195 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9198 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9199 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009201 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009202 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009203 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009204 if (srelease)
9205 PyMem_FREE(sbuf);
9206 if (release1)
9207 PyMem_FREE(buf1);
9208 if (release2)
9209 PyMem_FREE(buf2);
9210 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009211
Benjamin Peterson29060642009-01-31 22:14:21 +00009212 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009213 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 if (srelease)
9215 PyMem_FREE(sbuf);
9216 if (release1)
9217 PyMem_FREE(buf1);
9218 if (release2)
9219 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009220 if (PyUnicode_CheckExact(self)) {
9221 Py_INCREF(self);
9222 return (PyObject *) self;
9223 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009224 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 error:
9226 if (srelease && sbuf)
9227 PyMem_FREE(sbuf);
9228 if (release1 && buf1)
9229 PyMem_FREE(buf1);
9230 if (release2 && buf2)
9231 PyMem_FREE(buf2);
9232 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233}
9234
9235/* --- Unicode Object Methods --------------------------------------------- */
9236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009237PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239\n\
9240Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009241characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009242
9243static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009244unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009245{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246 return fixup(self, fixtitle);
9247}
9248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009249PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009250 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009251\n\
9252Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009253have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254
9255static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009256unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009257{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258 return fixup(self, fixcapitalize);
9259}
9260
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize every word in place in the
   resulting list, then join the words again.  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *word;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words) {
        return NULL;
    }

    /* Capitalize each word, swapping it into the list slot */
    while (idx < PyList_GET_SIZE(words)) {
        word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, word);
        idx++;
    }

    /* Join the words to form a new string */
    word = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)word;
}
#endif
9298
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009299/* Argument converter. Coerces to a single unicode character */
9300
9301static int
9302convert_uc(PyObject *obj, void *addr)
9303{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009305 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009306
Benjamin Peterson14339b62009-01-31 16:36:08 +00009307 uniobj = PyUnicode_FromObject(obj);
9308 if (uniobj == NULL) {
9309 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009310 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009311 return 0;
9312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009314 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009315 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009316 Py_DECREF(uniobj);
9317 return 0;
9318 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009320 Py_DECREF(uniobj);
9321 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009322}
9323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009324PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009325 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009327Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009328done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329
9330static PyObject *
9331unicode_center(PyUnicodeObject *self, PyObject *args)
9332{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009333 Py_ssize_t marg, left;
9334 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009335 Py_UCS4 fillchar = ' ';
9336
Victor Stinnere9a29352011-10-01 02:14:59 +02009337 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339
Victor Stinnere9a29352011-10-01 02:14:59 +02009340 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341 return NULL;
9342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344 Py_INCREF(self);
9345 return (PyObject*) self;
9346 }
9347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349 left = marg / 2 + (marg & width & 1);
9350
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009351 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352}
9353
Marc-André Lemburge5034372000-08-08 08:04:29 +00009354#if 0
9355
9356/* This code should go into some future Unicode collation support
9357 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009358 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009359
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009360/* speedy UTF-16 code point order comparison */
9361/* gleaned from: */
9362/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9363
/* Per-block adjustment table, indexed by the top five bits (c >> 11) of a
   UTF-16 code unit.  Only the last five entries are non-zero. */
static short utf16Fixup[32] =
{
    /*  0 ..  7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8 .. 15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16 .. 23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24 .. 31 */ 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
9371
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372static int
9373unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9374{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009375 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009376
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 Py_UNICODE *s1 = str1->str;
9378 Py_UNICODE *s2 = str2->str;
9379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380 len1 = str1->_base._base.length;
9381 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009382
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009384 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009385
9386 c1 = *s1++;
9387 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009388
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 if (c1 > (1<<11) * 26)
9390 c1 += utf16Fixup[c1>>11];
9391 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009392 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009393 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009394
9395 if (c1 != c2)
9396 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009397
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009398 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399 }
9400
9401 return (len1 < len2) ? -1 : (len1 != len2);
9402}
9403
Marc-André Lemburge5034372000-08-08 08:04:29 +00009404#else
9405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406/* This function assumes that str1 and str2 are readied by the caller. */
9407
Marc-André Lemburge5034372000-08-08 08:04:29 +00009408static int
9409unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 int kind1, kind2;
9412 void *data1, *data2;
9413 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 kind1 = PyUnicode_KIND(str1);
9416 kind2 = PyUnicode_KIND(str2);
9417 data1 = PyUnicode_DATA(str1);
9418 data2 = PyUnicode_DATA(str2);
9419 len1 = PyUnicode_GET_LENGTH(str1);
9420 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422 for (i = 0; i < len1 && i < len2; ++i) {
9423 Py_UCS4 c1, c2;
9424 c1 = PyUnicode_READ(kind1, data1, i);
9425 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009426
9427 if (c1 != c2)
9428 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009429 }
9430
9431 return (len1 < len2) ? -1 : (len1 != len2);
9432}
9433
9434#endif
9435
Alexander Belopolsky40018472011-02-26 01:02:56 +00009436int
9437PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009438{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9440 if (PyUnicode_READY(left) == -1 ||
9441 PyUnicode_READY(right) == -1)
9442 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009443 return unicode_compare((PyUnicodeObject *)left,
9444 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009446 PyErr_Format(PyExc_TypeError,
9447 "Can't compare %.100s and %.100s",
9448 left->ob_type->tp_name,
9449 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009450 return -1;
9451}
9452
Martin v. Löwis5b222132007-06-10 09:51:05 +00009453int
9454PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9455{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 Py_ssize_t i;
9457 int kind;
9458 void *data;
9459 Py_UCS4 chr;
9460
Martin v. Löwis5b222132007-06-10 09:51:05 +00009461 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 if (PyUnicode_READY(uni) == -1)
9463 return -1;
9464 kind = PyUnicode_KIND(uni);
9465 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009466 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9468 if (chr != str[i])
9469 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009470 /* This check keeps Python strings that end in '\0' from comparing equal
9471 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009474 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009476 return 0;
9477}
9478
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009479
/* Map a C truth value to the Py_True / Py_False singleton (borrowed). */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009482
Alexander Belopolsky40018472011-02-26 01:02:56 +00009483PyObject *
9484PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009485{
9486 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009487
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009488 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9489 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 if (PyUnicode_READY(left) == -1 ||
9491 PyUnicode_READY(right) == -1)
9492 return NULL;
9493 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9494 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009495 if (op == Py_EQ) {
9496 Py_INCREF(Py_False);
9497 return Py_False;
9498 }
9499 if (op == Py_NE) {
9500 Py_INCREF(Py_True);
9501 return Py_True;
9502 }
9503 }
9504 if (left == right)
9505 result = 0;
9506 else
9507 result = unicode_compare((PyUnicodeObject *)left,
9508 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009509
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009510 /* Convert the return value to a Boolean */
9511 switch (op) {
9512 case Py_EQ:
9513 v = TEST_COND(result == 0);
9514 break;
9515 case Py_NE:
9516 v = TEST_COND(result != 0);
9517 break;
9518 case Py_LE:
9519 v = TEST_COND(result <= 0);
9520 break;
9521 case Py_GE:
9522 v = TEST_COND(result >= 0);
9523 break;
9524 case Py_LT:
9525 v = TEST_COND(result == -1);
9526 break;
9527 case Py_GT:
9528 v = TEST_COND(result == 1);
9529 break;
9530 default:
9531 PyErr_BadArgument();
9532 return NULL;
9533 }
9534 Py_INCREF(v);
9535 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009536 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009537
Brian Curtindfc80e32011-08-10 20:28:54 -05009538 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009539}
9540
Alexander Belopolsky40018472011-02-26 01:02:56 +00009541int
9542PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009543{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009544 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 int kind1, kind2, kind;
9546 void *buf1, *buf2;
9547 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009548 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009549
9550 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009551 sub = PyUnicode_FromObject(element);
9552 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009553 PyErr_Format(PyExc_TypeError,
9554 "'in <string>' requires string as left operand, not %s",
9555 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009556 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 if (PyUnicode_READY(sub) == -1)
9559 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009560
Thomas Wouters477c8d52006-05-27 19:21:47 +00009561 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009562 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009563 Py_DECREF(sub);
9564 return -1;
9565 }
9566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 kind1 = PyUnicode_KIND(str);
9568 kind2 = PyUnicode_KIND(sub);
9569 kind = kind1 > kind2 ? kind1 : kind2;
9570 buf1 = PyUnicode_DATA(str);
9571 buf2 = PyUnicode_DATA(sub);
9572 if (kind1 != kind)
9573 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9574 if (!buf1) {
9575 Py_DECREF(sub);
9576 return -1;
9577 }
9578 if (kind2 != kind)
9579 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9580 if (!buf2) {
9581 Py_DECREF(sub);
9582 if (kind1 != kind) PyMem_Free(buf1);
9583 return -1;
9584 }
9585 len1 = PyUnicode_GET_LENGTH(str);
9586 len2 = PyUnicode_GET_LENGTH(sub);
9587
9588 switch(kind) {
9589 case PyUnicode_1BYTE_KIND:
9590 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9591 break;
9592 case PyUnicode_2BYTE_KIND:
9593 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9594 break;
9595 case PyUnicode_4BYTE_KIND:
9596 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9597 break;
9598 default:
9599 result = -1;
9600 assert(0);
9601 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009602
9603 Py_DECREF(str);
9604 Py_DECREF(sub);
9605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 if (kind1 != kind)
9607 PyMem_Free(buf1);
9608 if (kind2 != kind)
9609 PyMem_Free(buf2);
9610
Guido van Rossum403d68b2000-03-13 15:55:09 +00009611 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009612}
9613
Guido van Rossumd57fd912000-03-10 22:53:23 +00009614/* Concat to string or Unicode object giving a new Unicode object. */
9615
Alexander Belopolsky40018472011-02-26 01:02:56 +00009616PyObject *
9617PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009618{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 PyObject *u = NULL, *v = NULL, *w;
9620 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009621
9622 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009625 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009628 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009629
9630 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009631 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009632 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009635 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009636 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 }
9639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009641 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 w = PyUnicode_New(
9645 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9646 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009648 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009649 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9650 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009651 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009652 v, 0,
9653 PyUnicode_GET_LENGTH(v)) < 0)
9654 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655 Py_DECREF(u);
9656 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658
Benjamin Peterson29060642009-01-31 22:14:21 +00009659 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660 Py_XDECREF(u);
9661 Py_XDECREF(v);
9662 return NULL;
9663}
9664
Walter Dörwald1ab83302007-05-18 17:15:44 +00009665void
9666PyUnicode_Append(PyObject **pleft, PyObject *right)
9667{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009668 PyObject *new;
9669 if (*pleft == NULL)
9670 return;
9671 if (right == NULL || !PyUnicode_Check(*pleft)) {
9672 Py_DECREF(*pleft);
9673 *pleft = NULL;
9674 return;
9675 }
9676 new = PyUnicode_Concat(*pleft, right);
9677 Py_DECREF(*pleft);
9678 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009679}
9680
9681void
9682PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9683{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009684 PyUnicode_Append(pleft, right);
9685 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009686}
9687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009688PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009689 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009690\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009691Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009692string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009693interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694
9695static PyObject *
9696unicode_count(PyUnicodeObject *self, PyObject *args)
9697{
9698 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009699 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009700 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009701 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 int kind1, kind2, kind;
9703 void *buf1, *buf2;
9704 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705
Jesus Ceaac451502011-04-20 17:09:23 +02009706 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9707 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009708 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 kind1 = PyUnicode_KIND(self);
9711 kind2 = PyUnicode_KIND(substring);
9712 kind = kind1 > kind2 ? kind1 : kind2;
9713 buf1 = PyUnicode_DATA(self);
9714 buf2 = PyUnicode_DATA(substring);
9715 if (kind1 != kind)
9716 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9717 if (!buf1) {
9718 Py_DECREF(substring);
9719 return NULL;
9720 }
9721 if (kind2 != kind)
9722 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9723 if (!buf2) {
9724 Py_DECREF(substring);
9725 if (kind1 != kind) PyMem_Free(buf1);
9726 return NULL;
9727 }
9728 len1 = PyUnicode_GET_LENGTH(self);
9729 len2 = PyUnicode_GET_LENGTH(substring);
9730
9731 ADJUST_INDICES(start, end, len1);
9732 switch(kind) {
9733 case PyUnicode_1BYTE_KIND:
9734 iresult = ucs1lib_count(
9735 ((Py_UCS1*)buf1) + start, end - start,
9736 buf2, len2, PY_SSIZE_T_MAX
9737 );
9738 break;
9739 case PyUnicode_2BYTE_KIND:
9740 iresult = ucs2lib_count(
9741 ((Py_UCS2*)buf1) + start, end - start,
9742 buf2, len2, PY_SSIZE_T_MAX
9743 );
9744 break;
9745 case PyUnicode_4BYTE_KIND:
9746 iresult = ucs4lib_count(
9747 ((Py_UCS4*)buf1) + start, end - start,
9748 buf2, len2, PY_SSIZE_T_MAX
9749 );
9750 break;
9751 default:
9752 assert(0); iresult = 0;
9753 }
9754
9755 result = PyLong_FromSsize_t(iresult);
9756
9757 if (kind1 != kind)
9758 PyMem_Free(buf1);
9759 if (kind2 != kind)
9760 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761
9762 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009763
Guido van Rossumd57fd912000-03-10 22:53:23 +00009764 return result;
9765}
9766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009767PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009768 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009770Encode S using the codec registered for encoding. Default encoding\n\
9771is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009772handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009773a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9774'xmlcharrefreplace' as well as any other name registered with\n\
9775codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776
9777static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009778unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009780 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009781 char *encoding = NULL;
9782 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009783
Benjamin Peterson308d6372009-09-18 21:42:35 +00009784 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9785 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009786 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009787 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009788}
9789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009790PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009791 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009792\n\
9793Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009794If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795
9796static PyObject*
9797unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9798{
9799 Py_UNICODE *e;
9800 Py_UNICODE *p;
9801 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009802 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009803 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804 PyUnicodeObject *u;
9805 int tabsize = 8;
9806
9807 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9811 return NULL;
9812
Thomas Wouters7e474022000-07-16 12:04:32 +00009813 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009814 i = 0; /* chars up to and including most recent \n or \r */
9815 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9817 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009819 if (tabsize > 0) {
9820 incr = tabsize - (j % tabsize); /* cannot overflow */
9821 if (j > PY_SSIZE_T_MAX - incr)
9822 goto overflow1;
9823 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009824 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009827 if (j > PY_SSIZE_T_MAX - 1)
9828 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829 j++;
9830 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009831 if (i > PY_SSIZE_T_MAX - j)
9832 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009833 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009834 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835 }
9836 }
9837
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009838 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009839 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009840
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841 /* Second pass: create output string and fill it */
9842 u = _PyUnicode_New(i + j);
9843 if (!u)
9844 return NULL;
9845
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009846 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 q = _PyUnicode_WSTR(u); /* next output char */
9848 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009852 if (tabsize > 0) {
9853 i = tabsize - (j % tabsize);
9854 j += i;
9855 while (i--) {
9856 if (q >= qe)
9857 goto overflow2;
9858 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009859 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009860 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009861 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009862 else {
9863 if (q >= qe)
9864 goto overflow2;
9865 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009866 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867 if (*p == '\n' || *p == '\r')
9868 j = 0;
9869 }
9870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 if (PyUnicode_READY(u) == -1) {
9872 Py_DECREF(u);
9873 return NULL;
9874 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009876
9877 overflow2:
9878 Py_DECREF(u);
9879 overflow1:
9880 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9881 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882}
9883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009884PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009885 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886\n\
9887Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009888such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889arguments start and end are interpreted as in slice notation.\n\
9890\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009891Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892
9893static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009894unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895{
Jesus Ceaac451502011-04-20 17:09:23 +02009896 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009897 Py_ssize_t start;
9898 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009899 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900
Jesus Ceaac451502011-04-20 17:09:23 +02009901 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9902 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 if (PyUnicode_READY(self) == -1)
9906 return NULL;
9907 if (PyUnicode_READY(substring) == -1)
9908 return NULL;
9909
9910 result = any_find_slice(
9911 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9912 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009913 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914
9915 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 if (result == -2)
9918 return NULL;
9919
Christian Heimes217cfd12007-12-02 14:31:20 +00009920 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009921}
9922
9923static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009924unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009926 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
9927 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930}
9931
Guido van Rossumc2504932007-09-18 19:42:40 +00009932/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009933 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009934static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009935unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936{
Guido van Rossumc2504932007-09-18 19:42:40 +00009937 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009938 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 if (_PyUnicode_HASH(self) != -1)
9941 return _PyUnicode_HASH(self);
9942 if (PyUnicode_READY(self) == -1)
9943 return -1;
9944 len = PyUnicode_GET_LENGTH(self);
9945
9946 /* The hash function as a macro, gets expanded three times below. */
9947#define HASH(P) \
9948 x = (Py_uhash_t)*P << 7; \
9949 while (--len >= 0) \
9950 x = (1000003*x) ^ (Py_uhash_t)*P++;
9951
9952 switch (PyUnicode_KIND(self)) {
9953 case PyUnicode_1BYTE_KIND: {
9954 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9955 HASH(c);
9956 break;
9957 }
9958 case PyUnicode_2BYTE_KIND: {
9959 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9960 HASH(s);
9961 break;
9962 }
9963 default: {
9964 Py_UCS4 *l;
9965 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9966 "Impossible switch case in unicode_hash");
9967 l = PyUnicode_4BYTE_DATA(self);
9968 HASH(l);
9969 break;
9970 }
9971 }
9972 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9973
Guido van Rossumc2504932007-09-18 19:42:40 +00009974 if (x == -1)
9975 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009977 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009978}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009981PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009982 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009983\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009984Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985
9986static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009988{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009989 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009990 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009991 Py_ssize_t start;
9992 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993
Jesus Ceaac451502011-04-20 17:09:23 +02009994 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9995 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 if (PyUnicode_READY(self) == -1)
9999 return NULL;
10000 if (PyUnicode_READY(substring) == -1)
10001 return NULL;
10002
10003 result = any_find_slice(
10004 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10005 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010006 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007
10008 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 if (result == -2)
10011 return NULL;
10012
Guido van Rossumd57fd912000-03-10 22:53:23 +000010013 if (result < 0) {
10014 PyErr_SetString(PyExc_ValueError, "substring not found");
10015 return NULL;
10016 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010017
Christian Heimes217cfd12007-12-02 14:31:20 +000010018 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019}
10020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010021PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010022 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010024Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010025at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026
10027static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010028unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 Py_ssize_t i, length;
10031 int kind;
10032 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033 int cased;
10034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 if (PyUnicode_READY(self) == -1)
10036 return NULL;
10037 length = PyUnicode_GET_LENGTH(self);
10038 kind = PyUnicode_KIND(self);
10039 data = PyUnicode_DATA(self);
10040
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 if (length == 1)
10043 return PyBool_FromLong(
10044 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010046 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010048 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010049
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 for (i = 0; i < length; i++) {
10052 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010053
Benjamin Peterson29060642009-01-31 22:14:21 +000010054 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10055 return PyBool_FromLong(0);
10056 else if (!cased && Py_UNICODE_ISLOWER(ch))
10057 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010059 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060}
10061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010062PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010063 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010064\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010065Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010066at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010067
10068static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010069unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 Py_ssize_t i, length;
10072 int kind;
10073 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010074 int cased;
10075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 if (PyUnicode_READY(self) == -1)
10077 return NULL;
10078 length = PyUnicode_GET_LENGTH(self);
10079 kind = PyUnicode_KIND(self);
10080 data = PyUnicode_DATA(self);
10081
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 if (length == 1)
10084 return PyBool_FromLong(
10085 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010086
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010087 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010089 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010090
Guido van Rossumd57fd912000-03-10 22:53:23 +000010091 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 for (i = 0; i < length; i++) {
10093 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010094
Benjamin Peterson29060642009-01-31 22:14:21 +000010095 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10096 return PyBool_FromLong(0);
10097 else if (!cased && Py_UNICODE_ISUPPER(ch))
10098 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010099 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010100 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010101}
10102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010103PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010104 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010105\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010106Return True if S is a titlecased string and there is at least one\n\
10107character in S, i.e. upper- and titlecase characters may only\n\
10108follow uncased characters and lowercase characters only cased ones.\n\
10109Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110
10111static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010112unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010113{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 Py_ssize_t i, length;
10115 int kind;
10116 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117 int cased, previous_is_cased;
10118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 if (PyUnicode_READY(self) == -1)
10120 return NULL;
10121 length = PyUnicode_GET_LENGTH(self);
10122 kind = PyUnicode_KIND(self);
10123 data = PyUnicode_DATA(self);
10124
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 if (length == 1) {
10127 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10128 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10129 (Py_UNICODE_ISUPPER(ch) != 0));
10130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010132 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010134 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010135
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136 cased = 0;
10137 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 for (i = 0; i < length; i++) {
10139 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010140
Benjamin Peterson29060642009-01-31 22:14:21 +000010141 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10142 if (previous_is_cased)
10143 return PyBool_FromLong(0);
10144 previous_is_cased = 1;
10145 cased = 1;
10146 }
10147 else if (Py_UNICODE_ISLOWER(ch)) {
10148 if (!previous_is_cased)
10149 return PyBool_FromLong(0);
10150 previous_is_cased = 1;
10151 cased = 1;
10152 }
10153 else
10154 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010156 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157}
10158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010159PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010160 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010162Return True if all characters in S are whitespace\n\
10163and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164
10165static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010166unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 Py_ssize_t i, length;
10169 int kind;
10170 void *data;
10171
10172 if (PyUnicode_READY(self) == -1)
10173 return NULL;
10174 length = PyUnicode_GET_LENGTH(self);
10175 kind = PyUnicode_KIND(self);
10176 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 if (length == 1)
10180 return PyBool_FromLong(
10181 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010183 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010185 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 for (i = 0; i < length; i++) {
10188 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010189 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010190 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010191 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010192 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193}
10194
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010195PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010196 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010197\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010198Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010199and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010200
10201static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010202unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 Py_ssize_t i, length;
10205 int kind;
10206 void *data;
10207
10208 if (PyUnicode_READY(self) == -1)
10209 return NULL;
10210 length = PyUnicode_GET_LENGTH(self);
10211 kind = PyUnicode_KIND(self);
10212 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010213
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010214 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 if (length == 1)
10216 return PyBool_FromLong(
10217 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010218
10219 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010221 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 for (i = 0; i < length; i++) {
10224 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010225 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010226 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010227 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010228}
10229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010230PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010231 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010232\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010233Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010234and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010235
10236static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010237unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 int kind;
10240 void *data;
10241 Py_ssize_t len, i;
10242
10243 if (PyUnicode_READY(self) == -1)
10244 return NULL;
10245
10246 kind = PyUnicode_KIND(self);
10247 data = PyUnicode_DATA(self);
10248 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010249
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010250 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 if (len == 1) {
10252 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10253 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10254 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010255
10256 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010258 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 for (i = 0; i < len; i++) {
10261 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010262 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010263 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010264 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010265 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010266}
10267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010268PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010269 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010271Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010272False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010273
10274static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010275unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 Py_ssize_t i, length;
10278 int kind;
10279 void *data;
10280
10281 if (PyUnicode_READY(self) == -1)
10282 return NULL;
10283 length = PyUnicode_GET_LENGTH(self);
10284 kind = PyUnicode_KIND(self);
10285 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 if (length == 1)
10289 return PyBool_FromLong(
10290 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010292 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010294 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 for (i = 0; i < length; i++) {
10297 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010298 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010300 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301}
10302
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010303PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010304 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010305\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010306Return True if all characters in S are digits\n\
10307and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308
10309static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010310unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 Py_ssize_t i, length;
10313 int kind;
10314 void *data;
10315
10316 if (PyUnicode_READY(self) == -1)
10317 return NULL;
10318 length = PyUnicode_GET_LENGTH(self);
10319 kind = PyUnicode_KIND(self);
10320 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 if (length == 1) {
10324 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10325 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10326 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010328 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010330 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 for (i = 0; i < length; i++) {
10333 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010334 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010336 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010337}
10338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010339PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010340 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010342Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010343False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344
10345static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010346unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 Py_ssize_t i, length;
10349 int kind;
10350 void *data;
10351
10352 if (PyUnicode_READY(self) == -1)
10353 return NULL;
10354 length = PyUnicode_GET_LENGTH(self);
10355 kind = PyUnicode_KIND(self);
10356 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 if (length == 1)
10360 return PyBool_FromLong(
10361 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010363 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010365 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 for (i = 0; i < length; i++) {
10368 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010369 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010371 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372}
10373
Martin v. Löwis47383402007-08-15 07:32:56 +000010374int
10375PyUnicode_IsIdentifier(PyObject *self)
10376{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 int kind;
10378 void *data;
10379 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010380 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 if (PyUnicode_READY(self) == -1) {
10383 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010384 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 }
10386
10387 /* Special case for empty strings */
10388 if (PyUnicode_GET_LENGTH(self) == 0)
10389 return 0;
10390 kind = PyUnicode_KIND(self);
10391 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010392
10393 /* PEP 3131 says that the first character must be in
10394 XID_Start and subsequent characters in XID_Continue,
10395 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010396 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010397 letters, digits, underscore). However, given the current
10398 definition of XID_Start and XID_Continue, it is sufficient
10399 to check just for these, except that _ must be allowed
10400 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010402 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010403 return 0;
10404
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010405 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010407 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010408 return 1;
10409}
10410
10411PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010412 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010413\n\
10414Return True if S is a valid identifier according\n\
10415to the language definition.");
10416
10417static PyObject*
10418unicode_isidentifier(PyObject *self)
10419{
10420 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10421}
10422
Georg Brandl559e5d72008-06-11 18:37:52 +000010423PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010424 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010425\n\
10426Return True if all characters in S are considered\n\
10427printable in repr() or S is empty, False otherwise.");
10428
10429static PyObject*
10430unicode_isprintable(PyObject *self)
10431{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 Py_ssize_t i, length;
10433 int kind;
10434 void *data;
10435
10436 if (PyUnicode_READY(self) == -1)
10437 return NULL;
10438 length = PyUnicode_GET_LENGTH(self);
10439 kind = PyUnicode_KIND(self);
10440 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010441
10442 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 if (length == 1)
10444 return PyBool_FromLong(
10445 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 for (i = 0; i < length; i++) {
10448 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010449 Py_RETURN_FALSE;
10450 }
10451 }
10452 Py_RETURN_TRUE;
10453}
10454
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010455PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010456 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010457\n\
10458Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010459iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010460
10461static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010462unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010463{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010464 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010465}
10466
Martin v. Löwis18e16552006-02-15 17:27:45 +000010467static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010468unicode_length(PyUnicodeObject *self)
10469{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 if (PyUnicode_READY(self) == -1)
10471 return -1;
10472 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473}
10474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010475PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010476 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010478Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010479done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480
10481static PyObject *
10482unicode_ljust(PyUnicodeObject *self, PyObject *args)
10483{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010484 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 Py_UCS4 fillchar = ' ';
10486
10487 if (PyUnicode_READY(self) == -1)
10488 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010489
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010490 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491 return NULL;
10492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494 Py_INCREF(self);
10495 return (PyObject*) self;
10496 }
10497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499}
10500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010501PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010502 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010504Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505
10506static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010507unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509 return fixup(self, fixlower);
10510}
10511
/* Selectors telling the strip helpers which end(s) of the string to
   trim; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: skips the three-character "|O:" prefix
   of the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
10520
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010521/* externally visible for str.strip(unicode) */
10522PyObject *
10523_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10524{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 void *data;
10526 int kind;
10527 Py_ssize_t i, j, len;
10528 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10531 return NULL;
10532
10533 kind = PyUnicode_KIND(self);
10534 data = PyUnicode_DATA(self);
10535 len = PyUnicode_GET_LENGTH(self);
10536 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10537 PyUnicode_DATA(sepobj),
10538 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010539
Benjamin Peterson14339b62009-01-31 16:36:08 +000010540 i = 0;
10541 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 while (i < len &&
10543 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010544 i++;
10545 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010546 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010547
Benjamin Peterson14339b62009-01-31 16:36:08 +000010548 j = len;
10549 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010550 do {
10551 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 } while (j >= i &&
10553 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010554 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010555 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010556
Victor Stinner12bab6d2011-10-01 01:53:49 +020010557 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558}
10559
10560PyObject*
10561PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10562{
10563 unsigned char *data;
10564 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010565 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566
Victor Stinnerde636f32011-10-01 03:55:54 +020010567 if (PyUnicode_READY(self) == -1)
10568 return NULL;
10569
10570 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10571
Victor Stinner12bab6d2011-10-01 01:53:49 +020010572 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010574 if (PyUnicode_CheckExact(self)) {
10575 Py_INCREF(self);
10576 return self;
10577 }
10578 else
10579 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 }
10581
Victor Stinner12bab6d2011-10-01 01:53:49 +020010582 length = end - start;
10583 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010584 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585
Victor Stinnerde636f32011-10-01 03:55:54 +020010586 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010587 PyErr_SetString(PyExc_IndexError, "string index out of range");
10588 return NULL;
10589 }
10590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 kind = PyUnicode_KIND(self);
10592 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010593 return PyUnicode_FromKindAndData(kind,
10594 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010595 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597
10598static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010599do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 int kind;
10602 void *data;
10603 Py_ssize_t len, i, j;
10604
10605 if (PyUnicode_READY(self) == -1)
10606 return NULL;
10607
10608 kind = PyUnicode_KIND(self);
10609 data = PyUnicode_DATA(self);
10610 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010611
Benjamin Peterson14339b62009-01-31 16:36:08 +000010612 i = 0;
10613 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010615 i++;
10616 }
10617 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010618
Benjamin Peterson14339b62009-01-31 16:36:08 +000010619 j = len;
10620 if (striptype != LEFTSTRIP) {
10621 do {
10622 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010624 j++;
10625 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010626
Victor Stinner12bab6d2011-10-01 01:53:49 +020010627 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628}
10629
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010630
10631static PyObject *
10632do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10633{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010634 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010635
Benjamin Peterson14339b62009-01-31 16:36:08 +000010636 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10637 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010638
Benjamin Peterson14339b62009-01-31 16:36:08 +000010639 if (sep != NULL && sep != Py_None) {
10640 if (PyUnicode_Check(sep))
10641 return _PyUnicode_XStrip(self, striptype, sep);
10642 else {
10643 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010644 "%s arg must be None or str",
10645 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010646 return NULL;
10647 }
10648 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010649
Benjamin Peterson14339b62009-01-31 16:36:08 +000010650 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010651}
10652
10653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010654PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010655 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010656\n\
10657Return a copy of the string S with leading and trailing\n\
10658whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010659If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010660
10661static PyObject *
10662unicode_strip(PyUnicodeObject *self, PyObject *args)
10663{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010664 if (PyTuple_GET_SIZE(args) == 0)
10665 return do_strip(self, BOTHSTRIP); /* Common case */
10666 else
10667 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010668}
10669
10670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010671PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010672 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010673\n\
10674Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010675If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010676
10677static PyObject *
10678unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10679{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010680 if (PyTuple_GET_SIZE(args) == 0)
10681 return do_strip(self, LEFTSTRIP); /* Common case */
10682 else
10683 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010684}
10685
10686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010687PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010688 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010689\n\
10690Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010691If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010692
10693static PyObject *
10694unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10695{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010696 if (PyTuple_GET_SIZE(args) == 0)
10697 return do_strip(self, RIGHTSTRIP); /* Common case */
10698 else
10699 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010700}
10701
10702
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010704unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705{
10706 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708
Georg Brandl222de0f2009-04-12 12:01:50 +000010709 if (len < 1) {
10710 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020010711 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000010712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713
Tim Peters7a29bd52001-09-12 03:03:31 +000010714 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715 /* no repeat, return original string */
10716 Py_INCREF(str);
10717 return (PyObject*) str;
10718 }
Tim Peters8f422462000-09-09 06:13:41 +000010719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 if (PyUnicode_READY(str) == -1)
10721 return NULL;
10722
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010723 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010724 PyErr_SetString(PyExc_OverflowError,
10725 "repeated string is too long");
10726 return NULL;
10727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731 if (!u)
10732 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010733 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 if (PyUnicode_GET_LENGTH(str) == 1) {
10736 const int kind = PyUnicode_KIND(str);
10737 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10738 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010739 if (kind == PyUnicode_1BYTE_KIND)
10740 memset(to, (unsigned char)fill_char, len);
10741 else {
10742 for (n = 0; n < len; ++n)
10743 PyUnicode_WRITE(kind, to, n, fill_char);
10744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 }
10746 else {
10747 /* number of characters copied this far */
10748 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10749 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10750 char *to = (char *) PyUnicode_DATA(u);
10751 Py_MEMCPY(to, PyUnicode_DATA(str),
10752 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010753 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 n = (done <= nchars-done) ? done : nchars-done;
10755 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010756 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010757 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758 }
10759
10760 return (PyObject*) u;
10761}
10762
Alexander Belopolsky40018472011-02-26 01:02:56 +000010763PyObject *
10764PyUnicode_Replace(PyObject *obj,
10765 PyObject *subobj,
10766 PyObject *replobj,
10767 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768{
10769 PyObject *self;
10770 PyObject *str1;
10771 PyObject *str2;
10772 PyObject *result;
10773
10774 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010775 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010778 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010779 Py_DECREF(self);
10780 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 }
10782 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010783 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010784 Py_DECREF(self);
10785 Py_DECREF(str1);
10786 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789 Py_DECREF(self);
10790 Py_DECREF(str1);
10791 Py_DECREF(str2);
10792 return result;
10793}
10794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010795PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010796 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797\n\
10798Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010799old replaced by new. If the optional argument count is\n\
10800given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801
10802static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 PyObject *str1;
10806 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010807 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808 PyObject *result;
10809
Martin v. Löwis18e16552006-02-15 17:27:45 +000010810 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010813 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 str1 = PyUnicode_FromObject(str1);
10815 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10816 return NULL;
10817 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020010818 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010819 Py_DECREF(str1);
10820 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822
10823 result = replace(self, str1, str2, maxcount);
10824
10825 Py_DECREF(str1);
10826 Py_DECREF(str2);
10827 return result;
10828}
10829
Alexander Belopolsky40018472011-02-26 01:02:56 +000010830static PyObject *
10831unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010833 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 Py_ssize_t isize;
10835 Py_ssize_t osize, squote, dquote, i, o;
10836 Py_UCS4 max, quote;
10837 int ikind, okind;
10838 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010841 return NULL;
10842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 isize = PyUnicode_GET_LENGTH(unicode);
10844 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 /* Compute length of output, quote characters, and
10847 maximum character */
10848 osize = 2; /* quotes */
10849 max = 127;
10850 squote = dquote = 0;
10851 ikind = PyUnicode_KIND(unicode);
10852 for (i = 0; i < isize; i++) {
10853 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10854 switch (ch) {
10855 case '\'': squote++; osize++; break;
10856 case '"': dquote++; osize++; break;
10857 case '\\': case '\t': case '\r': case '\n':
10858 osize += 2; break;
10859 default:
10860 /* Fast-path ASCII */
10861 if (ch < ' ' || ch == 0x7f)
10862 osize += 4; /* \xHH */
10863 else if (ch < 0x7f)
10864 osize++;
10865 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10866 osize++;
10867 max = ch > max ? ch : max;
10868 }
10869 else if (ch < 0x100)
10870 osize += 4; /* \xHH */
10871 else if (ch < 0x10000)
10872 osize += 6; /* \uHHHH */
10873 else
10874 osize += 10; /* \uHHHHHHHH */
10875 }
10876 }
10877
10878 quote = '\'';
10879 if (squote) {
10880 if (dquote)
10881 /* Both squote and dquote present. Use squote,
10882 and escape them */
10883 osize += squote;
10884 else
10885 quote = '"';
10886 }
10887
10888 repr = PyUnicode_New(osize, max);
10889 if (repr == NULL)
10890 return NULL;
10891 okind = PyUnicode_KIND(repr);
10892 odata = PyUnicode_DATA(repr);
10893
10894 PyUnicode_WRITE(okind, odata, 0, quote);
10895 PyUnicode_WRITE(okind, odata, osize-1, quote);
10896
10897 for (i = 0, o = 1; i < isize; i++) {
10898 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010899
10900 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010901 if ((ch == quote) || (ch == '\\')) {
10902 PyUnicode_WRITE(okind, odata, o++, '\\');
10903 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010904 continue;
10905 }
10906
Benjamin Peterson29060642009-01-31 22:14:21 +000010907 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010908 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909 PyUnicode_WRITE(okind, odata, o++, '\\');
10910 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010911 }
10912 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010913 PyUnicode_WRITE(okind, odata, o++, '\\');
10914 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010915 }
10916 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917 PyUnicode_WRITE(okind, odata, o++, '\\');
10918 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010919 }
10920
10921 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010922 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010923 PyUnicode_WRITE(okind, odata, o++, '\\');
10924 PyUnicode_WRITE(okind, odata, o++, 'x');
10925 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10926 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010927 }
10928
Georg Brandl559e5d72008-06-11 18:37:52 +000010929 /* Copy ASCII characters as-is */
10930 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010932 }
10933
Benjamin Peterson29060642009-01-31 22:14:21 +000010934 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010935 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010936 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010937 (categories Z* and C* except ASCII space)
10938 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010940 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 if (ch <= 0xff) {
10942 PyUnicode_WRITE(okind, odata, o++, '\\');
10943 PyUnicode_WRITE(okind, odata, o++, 'x');
10944 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10945 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010946 }
10947 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948 else if (ch >= 0x10000) {
10949 PyUnicode_WRITE(okind, odata, o++, '\\');
10950 PyUnicode_WRITE(okind, odata, o++, 'U');
10951 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10952 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10953 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10954 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10955 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10956 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10957 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10958 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010959 }
10960 /* Map 16-bit characters to '\uxxxx' */
10961 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 PyUnicode_WRITE(okind, odata, o++, '\\');
10963 PyUnicode_WRITE(okind, odata, o++, 'u');
10964 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10965 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10966 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10967 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010968 }
10969 }
10970 /* Copy characters as-is */
10971 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010973 }
10974 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010975 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010977 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978}
10979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010980PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010981 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982\n\
10983Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010984such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985arguments start and end are interpreted as in slice notation.\n\
10986\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010987Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988
10989static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991{
Jesus Ceaac451502011-04-20 17:09:23 +020010992 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010993 Py_ssize_t start;
10994 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010995 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996
Jesus Ceaac451502011-04-20 17:09:23 +020010997 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10998 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010999 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 if (PyUnicode_READY(self) == -1)
11002 return NULL;
11003 if (PyUnicode_READY(substring) == -1)
11004 return NULL;
11005
11006 result = any_find_slice(
11007 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11008 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011009 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010
11011 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 if (result == -2)
11014 return NULL;
11015
Christian Heimes217cfd12007-12-02 14:31:20 +000011016 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017}
11018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011019PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011022Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023
11024static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026{
Jesus Ceaac451502011-04-20 17:09:23 +020011027 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011028 Py_ssize_t start;
11029 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011030 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031
Jesus Ceaac451502011-04-20 17:09:23 +020011032 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11033 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 if (PyUnicode_READY(self) == -1)
11037 return NULL;
11038 if (PyUnicode_READY(substring) == -1)
11039 return NULL;
11040
11041 result = any_find_slice(
11042 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11043 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011044 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045
11046 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048 if (result == -2)
11049 return NULL;
11050
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051 if (result < 0) {
11052 PyErr_SetString(PyExc_ValueError, "substring not found");
11053 return NULL;
11054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055
Christian Heimes217cfd12007-12-02 14:31:20 +000011056 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057}
11058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011059PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011060 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011062Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011063done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
11065static PyObject *
11066unicode_rjust(PyUnicodeObject *self, PyObject *args)
11067{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011068 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069 Py_UCS4 fillchar = ' ';
11070
Victor Stinnere9a29352011-10-01 02:14:59 +020011071 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011073
Victor Stinnere9a29352011-10-01 02:14:59 +020011074 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075 return NULL;
11076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078 Py_INCREF(self);
11079 return (PyObject*) self;
11080 }
11081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011083}
11084
Alexander Belopolsky40018472011-02-26 01:02:56 +000011085PyObject *
11086PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087{
11088 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011089
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090 s = PyUnicode_FromObject(s);
11091 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011092 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011093 if (sep != NULL) {
11094 sep = PyUnicode_FromObject(sep);
11095 if (sep == NULL) {
11096 Py_DECREF(s);
11097 return NULL;
11098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099 }
11100
11101 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11102
11103 Py_DECREF(s);
11104 Py_XDECREF(sep);
11105 return result;
11106}
11107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011108PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011109 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110\n\
11111Return a list of the words in S, using sep as the\n\
11112delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011113splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011114whitespace string is a separator and empty strings are\n\
11115removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116
11117static PyObject*
11118unicode_split(PyUnicodeObject *self, PyObject *args)
11119{
11120 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011121 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122
Martin v. Löwis18e16552006-02-15 17:27:45 +000011123 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124 return NULL;
11125
11126 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011127 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011129 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011131 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132}
11133
Thomas Wouters477c8d52006-05-27 19:21:47 +000011134PyObject *
11135PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11136{
11137 PyObject* str_obj;
11138 PyObject* sep_obj;
11139 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140 int kind1, kind2, kind;
11141 void *buf1 = NULL, *buf2 = NULL;
11142 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011143
11144 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011145 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011147 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011149 Py_DECREF(str_obj);
11150 return NULL;
11151 }
11152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 kind1 = PyUnicode_KIND(str_in);
11154 kind2 = PyUnicode_KIND(sep_obj);
11155 kind = kind1 > kind2 ? kind1 : kind2;
11156 buf1 = PyUnicode_DATA(str_in);
11157 if (kind1 != kind)
11158 buf1 = _PyUnicode_AsKind(str_in, kind);
11159 if (!buf1)
11160 goto onError;
11161 buf2 = PyUnicode_DATA(sep_obj);
11162 if (kind2 != kind)
11163 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11164 if (!buf2)
11165 goto onError;
11166 len1 = PyUnicode_GET_LENGTH(str_obj);
11167 len2 = PyUnicode_GET_LENGTH(sep_obj);
11168
11169 switch(PyUnicode_KIND(str_in)) {
11170 case PyUnicode_1BYTE_KIND:
11171 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11172 break;
11173 case PyUnicode_2BYTE_KIND:
11174 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11175 break;
11176 case PyUnicode_4BYTE_KIND:
11177 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11178 break;
11179 default:
11180 assert(0);
11181 out = 0;
11182 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011183
11184 Py_DECREF(sep_obj);
11185 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 if (kind1 != kind)
11187 PyMem_Free(buf1);
11188 if (kind2 != kind)
11189 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011190
11191 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 onError:
11193 Py_DECREF(sep_obj);
11194 Py_DECREF(str_obj);
11195 if (kind1 != kind && buf1)
11196 PyMem_Free(buf1);
11197 if (kind2 != kind && buf2)
11198 PyMem_Free(buf2);
11199 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011200}
11201
11202
11203PyObject *
11204PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11205{
11206 PyObject* str_obj;
11207 PyObject* sep_obj;
11208 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 int kind1, kind2, kind;
11210 void *buf1 = NULL, *buf2 = NULL;
11211 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011212
11213 str_obj = PyUnicode_FromObject(str_in);
11214 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011215 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011216 sep_obj = PyUnicode_FromObject(sep_in);
11217 if (!sep_obj) {
11218 Py_DECREF(str_obj);
11219 return NULL;
11220 }
11221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 kind1 = PyUnicode_KIND(str_in);
11223 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011224 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 buf1 = PyUnicode_DATA(str_in);
11226 if (kind1 != kind)
11227 buf1 = _PyUnicode_AsKind(str_in, kind);
11228 if (!buf1)
11229 goto onError;
11230 buf2 = PyUnicode_DATA(sep_obj);
11231 if (kind2 != kind)
11232 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11233 if (!buf2)
11234 goto onError;
11235 len1 = PyUnicode_GET_LENGTH(str_obj);
11236 len2 = PyUnicode_GET_LENGTH(sep_obj);
11237
11238 switch(PyUnicode_KIND(str_in)) {
11239 case PyUnicode_1BYTE_KIND:
11240 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11241 break;
11242 case PyUnicode_2BYTE_KIND:
11243 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11244 break;
11245 case PyUnicode_4BYTE_KIND:
11246 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11247 break;
11248 default:
11249 assert(0);
11250 out = 0;
11251 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011252
11253 Py_DECREF(sep_obj);
11254 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 if (kind1 != kind)
11256 PyMem_Free(buf1);
11257 if (kind2 != kind)
11258 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011259
11260 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 onError:
11262 Py_DECREF(sep_obj);
11263 Py_DECREF(str_obj);
11264 if (kind1 != kind && buf1)
11265 PyMem_Free(buf1);
11266 if (kind2 != kind && buf2)
11267 PyMem_Free(buf2);
11268 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011269}
11270
11271PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011272 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011273\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011274Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011275the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011276found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011277
11278static PyObject*
11279unicode_partition(PyUnicodeObject *self, PyObject *separator)
11280{
11281 return PyUnicode_Partition((PyObject *)self, separator);
11282}
11283
11284PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011285 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011286\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011287Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011288the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011289separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011290
11291static PyObject*
11292unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11293{
11294 return PyUnicode_RPartition((PyObject *)self, separator);
11295}
11296
Alexander Belopolsky40018472011-02-26 01:02:56 +000011297PyObject *
11298PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011299{
11300 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011301
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011302 s = PyUnicode_FromObject(s);
11303 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011304 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011305 if (sep != NULL) {
11306 sep = PyUnicode_FromObject(sep);
11307 if (sep == NULL) {
11308 Py_DECREF(s);
11309 return NULL;
11310 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011311 }
11312
11313 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11314
11315 Py_DECREF(s);
11316 Py_XDECREF(sep);
11317 return result;
11318}
11319
11320PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011321 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011322\n\
11323Return a list of the words in S, using sep as the\n\
11324delimiter string, starting at the end of the string and\n\
11325working to the front. If maxsplit is given, at most maxsplit\n\
11326splits are done. If sep is not specified, any whitespace string\n\
11327is a separator.");
11328
11329static PyObject*
11330unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11331{
11332 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011333 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011334
Martin v. Löwis18e16552006-02-15 17:27:45 +000011335 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011336 return NULL;
11337
11338 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011339 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011340 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011341 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011342 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011343 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011344}
11345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011346PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348\n\
11349Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011350Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011351is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352
11353static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011354unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011356 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011357 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011359 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11360 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361 return NULL;
11362
Guido van Rossum86662912000-04-11 15:38:46 +000011363 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364}
11365
11366static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011367PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368{
Walter Dörwald346737f2007-05-31 10:44:43 +000011369 if (PyUnicode_CheckExact(self)) {
11370 Py_INCREF(self);
11371 return self;
11372 } else
11373 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011374 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375}
11376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011377PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011378 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379\n\
11380Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011381and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382
11383static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011384unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386 return fixup(self, fixswapcase);
11387}
11388
Georg Brandlceee0772007-11-27 23:48:05 +000011389PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011390 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011391\n\
11392Return a translation table usable for str.translate().\n\
11393If there is only one argument, it must be a dictionary mapping Unicode\n\
11394ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011395Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011396If there are two arguments, they must be strings of equal length, and\n\
11397in the resulting dictionary, each character in x will be mapped to the\n\
11398character at the same position in y. If there is a third argument, it\n\
11399must be a string, whose characters will be mapped to None in the result.");
11400
11401static PyObject*
11402unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11403{
11404 PyObject *x, *y = NULL, *z = NULL;
11405 PyObject *new = NULL, *key, *value;
11406 Py_ssize_t i = 0;
11407 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011408
Georg Brandlceee0772007-11-27 23:48:05 +000011409 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11410 return NULL;
11411 new = PyDict_New();
11412 if (!new)
11413 return NULL;
11414 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 int x_kind, y_kind, z_kind;
11416 void *x_data, *y_data, *z_data;
11417
Georg Brandlceee0772007-11-27 23:48:05 +000011418 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011419 if (!PyUnicode_Check(x)) {
11420 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11421 "be a string if there is a second argument");
11422 goto err;
11423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011425 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11426 "arguments must have equal length");
11427 goto err;
11428 }
11429 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 x_kind = PyUnicode_KIND(x);
11431 y_kind = PyUnicode_KIND(y);
11432 x_data = PyUnicode_DATA(x);
11433 y_data = PyUnicode_DATA(y);
11434 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11435 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11436 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011437 if (!key || !value)
11438 goto err;
11439 res = PyDict_SetItem(new, key, value);
11440 Py_DECREF(key);
11441 Py_DECREF(value);
11442 if (res < 0)
11443 goto err;
11444 }
11445 /* create entries for deleting chars in z */
11446 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 z_kind = PyUnicode_KIND(z);
11448 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011449 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011451 if (!key)
11452 goto err;
11453 res = PyDict_SetItem(new, key, Py_None);
11454 Py_DECREF(key);
11455 if (res < 0)
11456 goto err;
11457 }
11458 }
11459 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 int kind;
11461 void *data;
11462
Georg Brandlceee0772007-11-27 23:48:05 +000011463 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011464 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011465 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11466 "to maketrans it must be a dict");
11467 goto err;
11468 }
11469 /* copy entries into the new dict, converting string keys to int keys */
11470 while (PyDict_Next(x, &i, &key, &value)) {
11471 if (PyUnicode_Check(key)) {
11472 /* convert string keys to integer keys */
11473 PyObject *newkey;
11474 if (PyUnicode_GET_SIZE(key) != 1) {
11475 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11476 "table must be of length 1");
11477 goto err;
11478 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011479 kind = PyUnicode_KIND(key);
11480 data = PyUnicode_DATA(key);
11481 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011482 if (!newkey)
11483 goto err;
11484 res = PyDict_SetItem(new, newkey, value);
11485 Py_DECREF(newkey);
11486 if (res < 0)
11487 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011488 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011489 /* just keep integer keys */
11490 if (PyDict_SetItem(new, key, value) < 0)
11491 goto err;
11492 } else {
11493 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11494 "be strings or integers");
11495 goto err;
11496 }
11497 }
11498 }
11499 return new;
11500 err:
11501 Py_DECREF(new);
11502 return NULL;
11503}
11504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011505PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507\n\
11508Return a copy of the string S, where all characters have been mapped\n\
11509through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011510Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011511Unmapped characters are left untouched. Characters mapped to None\n\
11512are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
11514static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518}
11519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011520PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011521 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011523Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524
11525static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011526unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528 return fixup(self, fixupper);
11529}
11530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011531PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011532 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011534Pad a numeric string S with zeros on the left, to fill a field\n\
11535of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
11537static PyObject *
11538unicode_zfill(PyUnicodeObject *self, PyObject *args)
11539{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011540 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011542 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 int kind;
11544 void *data;
11545 Py_UCS4 chr;
11546
11547 if (PyUnicode_READY(self) == -1)
11548 return NULL;
11549
Martin v. Löwis18e16552006-02-15 17:27:45 +000011550 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551 return NULL;
11552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011554 if (PyUnicode_CheckExact(self)) {
11555 Py_INCREF(self);
11556 return (PyObject*) self;
11557 }
11558 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011559 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560 }
11561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563
11564 u = pad(self, fill, 0, '0');
11565
Walter Dörwald068325e2002-04-15 13:36:47 +000011566 if (u == NULL)
11567 return NULL;
11568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 kind = PyUnicode_KIND(u);
11570 data = PyUnicode_DATA(u);
11571 chr = PyUnicode_READ(kind, data, fill);
11572
11573 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 PyUnicode_WRITE(kind, data, 0, chr);
11576 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577 }
11578
11579 return (PyObject*) u;
11580}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
11589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011590PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011591 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011593Return True if S starts with the specified prefix, False otherwise.\n\
11594With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011595With optional end, stop comparing S at that position.\n\
11596prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597
11598static PyObject *
11599unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011600 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011602 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011604 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011605 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011606 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607
Jesus Ceaac451502011-04-20 17:09:23 +020011608 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011609 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011610 if (PyTuple_Check(subobj)) {
11611 Py_ssize_t i;
11612 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11613 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011614 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011615 if (substring == NULL)
11616 return NULL;
11617 result = tailmatch(self, substring, start, end, -1);
11618 Py_DECREF(substring);
11619 if (result) {
11620 Py_RETURN_TRUE;
11621 }
11622 }
11623 /* nothing matched */
11624 Py_RETURN_FALSE;
11625 }
11626 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011627 if (substring == NULL) {
11628 if (PyErr_ExceptionMatches(PyExc_TypeError))
11629 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11630 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011631 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011632 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011633 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011635 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636}
11637
11638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011639PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011640 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011642Return True if S ends with the specified suffix, False otherwise.\n\
11643With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011644With optional end, stop comparing S at that position.\n\
11645suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646
11647static PyObject *
11648unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011651 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011653 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011654 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011655 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656
Jesus Ceaac451502011-04-20 17:09:23 +020011657 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011658 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011659 if (PyTuple_Check(subobj)) {
11660 Py_ssize_t i;
11661 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11662 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011663 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011664 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011665 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011666 result = tailmatch(self, substring, start, end, +1);
11667 Py_DECREF(substring);
11668 if (result) {
11669 Py_RETURN_TRUE;
11670 }
11671 }
11672 Py_RETURN_FALSE;
11673 }
11674 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011675 if (substring == NULL) {
11676 if (PyErr_ExceptionMatches(PyExc_TypeError))
11677 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11678 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011679 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011680 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011681 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011683 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684}
11685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011687
11688PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011689 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011690\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011691Return a formatted version of S, using substitutions from args and kwargs.\n\
11692The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011693
Eric Smith27bbca62010-11-04 17:06:58 +000011694PyDoc_STRVAR(format_map__doc__,
11695 "S.format_map(mapping) -> str\n\
11696\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011697Return a formatted version of S, using substitutions from mapping.\n\
11698The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011699
Eric Smith4a7d76d2008-05-30 18:10:19 +000011700static PyObject *
11701unicode__format__(PyObject* self, PyObject* args)
11702{
11703 PyObject *format_spec;
11704
11705 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11706 return NULL;
11707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11709 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011710}
11711
Eric Smith8c663262007-08-25 02:26:07 +000011712PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011713 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011714\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011715Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011716
11717static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011718unicode__sizeof__(PyUnicodeObject *v)
11719{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 Py_ssize_t size;
11721
11722 /* If it's a compact object, account for base structure +
11723 character data. */
11724 if (PyUnicode_IS_COMPACT_ASCII(v))
11725 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11726 else if (PyUnicode_IS_COMPACT(v))
11727 size = sizeof(PyCompactUnicodeObject) +
11728 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11729 else {
11730 /* If it is a two-block object, account for base object, and
11731 for character block if present. */
11732 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020011733 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 size += (PyUnicode_GET_LENGTH(v) + 1) *
11735 PyUnicode_CHARACTER_SIZE(v);
11736 }
11737 /* If the wstr pointer is present, account for it unless it is shared
11738 with the data pointer. Since PyUnicode_DATA will crash if the object
11739 is not ready, check whether it's either not ready (in which case the
11740 data is entirely in wstr) or if the data is not shared. */
11741 if (_PyUnicode_WSTR(v) &&
11742 (!PyUnicode_IS_READY(v) ||
11743 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11744 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020011745 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011746 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747
11748 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011749}
11750
11751PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011752 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011753
11754static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011755unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011756{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011757 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 if (!copy)
11759 return NULL;
11760 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011761}
11762
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763static PyMethodDef unicode_methods[] = {
11764
11765 /* Order is according to common usage: often used methods should
11766 appear first, since lookup is done sequentially. */
11767
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011768 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011769 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11770 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011771 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011772 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11773 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11774 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11775 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11776 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11777 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11778 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011779 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011780 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11781 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11782 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011783 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011784 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11785 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11786 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011787 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011788 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011789 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011790 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011791 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11792 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11793 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11794 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11795 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11796 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11797 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11798 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11799 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11800 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11801 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11802 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11803 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11804 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011805 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011806 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011807 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011808 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011809 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011810 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011811 {"maketrans", (PyCFunction) unicode_maketrans,
11812 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011813 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011814#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011815 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816#endif
11817
11818#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011819 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011820 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821#endif
11822
Benjamin Peterson14339b62009-01-31 16:36:08 +000011823 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 {NULL, NULL}
11825};
11826
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011827static PyObject *
11828unicode_mod(PyObject *v, PyObject *w)
11829{
Brian Curtindfc80e32011-08-10 20:28:54 -050011830 if (!PyUnicode_Check(v))
11831 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011832 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011833}
11834
11835static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011836 0, /*nb_add*/
11837 0, /*nb_subtract*/
11838 0, /*nb_multiply*/
11839 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011840};
11841
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011843 (lenfunc) unicode_length, /* sq_length */
11844 PyUnicode_Concat, /* sq_concat */
11845 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11846 (ssizeargfunc) unicode_getitem, /* sq_item */
11847 0, /* sq_slice */
11848 0, /* sq_ass_item */
11849 0, /* sq_ass_slice */
11850 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851};
11852
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011853static PyObject*
11854unicode_subscript(PyUnicodeObject* self, PyObject* item)
11855{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 if (PyUnicode_READY(self) == -1)
11857 return NULL;
11858
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011859 if (PyIndex_Check(item)) {
11860 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011861 if (i == -1 && PyErr_Occurred())
11862 return NULL;
11863 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011865 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011866 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011867 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011869 Py_UNICODE* result_buf;
11870 PyObject* result;
11871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011873 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011874 return NULL;
11875 }
11876
11877 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 return PyUnicode_New(0, 0);
11879 } else if (start == 0 && step == 1 &&
11880 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011881 PyUnicode_CheckExact(self)) {
11882 Py_INCREF(self);
11883 return (PyObject *)self;
11884 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011885 return PyUnicode_Substring((PyObject*)self,
11886 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011887 } else {
11888 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011889 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11890 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011891
Benjamin Peterson29060642009-01-31 22:14:21 +000011892 if (result_buf == NULL)
11893 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011894
11895 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11896 result_buf[i] = source_buf[cur];
11897 }
Tim Petersced69f82003-09-16 20:30:58 +000011898
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011899 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011900 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011901 return result;
11902 }
11903 } else {
11904 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11905 return NULL;
11906 }
11907}
11908
11909static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011910 (lenfunc)unicode_length, /* mp_length */
11911 (binaryfunc)unicode_subscript, /* mp_subscript */
11912 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011913};
11914
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916/* Helpers for PyUnicode_Format() */
11917
11918static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011919getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011921 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011923 (*p_argidx)++;
11924 if (arglen < 0)
11925 return args;
11926 else
11927 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928 }
11929 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931 return NULL;
11932}
11933
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011934/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011936static PyObject *
11937formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011939 char *p;
11940 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011942
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 x = PyFloat_AsDouble(v);
11944 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011945 return NULL;
11946
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011948 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011949
Eric Smith0923d1d2009-04-16 20:16:10 +000011950 p = PyOS_double_to_string(x, type, prec,
11951 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011952 if (p == NULL)
11953 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011955 PyMem_Free(p);
11956 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957}
11958
Tim Peters38fd5b62000-09-21 05:43:11 +000011959static PyObject*
11960formatlong(PyObject *val, int flags, int prec, int type)
11961{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011962 char *buf;
11963 int len;
11964 PyObject *str; /* temporary string object. */
11965 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011966
Benjamin Peterson14339b62009-01-31 16:36:08 +000011967 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11968 if (!str)
11969 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011971 Py_DECREF(str);
11972 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011973}
11974
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011977 size_t buflen,
11978 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011980 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011981 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 if (PyUnicode_GET_LENGTH(v) == 1) {
11983 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011984 buf[1] = '\0';
11985 return 1;
11986 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011987 goto onError;
11988 }
11989 else {
11990 /* Integer input truncated to a character */
11991 long x;
11992 x = PyLong_AsLong(v);
11993 if (x == -1 && PyErr_Occurred())
11994 goto onError;
11995
11996 if (x < 0 || x > 0x10ffff) {
11997 PyErr_SetString(PyExc_OverflowError,
11998 "%c arg not in range(0x110000)");
11999 return -1;
12000 }
12001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012003 buf[1] = '\0';
12004 return 1;
12005 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012006
Benjamin Peterson29060642009-01-31 22:14:21 +000012007 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012008 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012010 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011}
12012
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012013/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012014 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012015*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012016#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012017
Alexander Belopolsky40018472011-02-26 01:02:56 +000012018PyObject *
12019PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 void *fmt;
12022 int fmtkind;
12023 PyObject *result;
12024 Py_UCS4 *res, *res0;
12025 Py_UCS4 max;
12026 int kind;
12027 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012031
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012033 PyErr_BadInternalCall();
12034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12037 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012038 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 fmt = PyUnicode_DATA(uformat);
12040 fmtkind = PyUnicode_KIND(uformat);
12041 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12042 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043
12044 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12046 if (res0 == NULL) {
12047 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012048 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050
12051 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012052 arglen = PyTuple_Size(args);
12053 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054 }
12055 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012056 arglen = -1;
12057 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012059 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012060 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012061 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062
12063 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012065 if (--rescnt < 0) {
12066 rescnt = fmtcnt + 100;
12067 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12069 if (res0 == NULL){
12070 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012071 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 }
12073 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012074 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012077 }
12078 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012079 /* Got a format specifier */
12080 int flags = 0;
12081 Py_ssize_t width = -1;
12082 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 Py_UCS4 c = '\0';
12084 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012085 int isnumok;
12086 PyObject *v = NULL;
12087 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 void *pbuf;
12089 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012090 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 Py_ssize_t len, len1;
12092 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012094 fmtpos++;
12095 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12096 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012097 Py_ssize_t keylen;
12098 PyObject *key;
12099 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012100
Benjamin Peterson29060642009-01-31 22:14:21 +000012101 if (dict == NULL) {
12102 PyErr_SetString(PyExc_TypeError,
12103 "format requires a mapping");
12104 goto onError;
12105 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012107 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 /* Skip over balanced parentheses */
12110 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012118 if (fmtcnt < 0 || pcount > 0) {
12119 PyErr_SetString(PyExc_ValueError,
12120 "incomplete format key");
12121 goto onError;
12122 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012123 key = PyUnicode_Substring((PyObject*)uformat,
12124 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012125 if (key == NULL)
12126 goto onError;
12127 if (args_owned) {
12128 Py_DECREF(args);
12129 args_owned = 0;
12130 }
12131 args = PyObject_GetItem(dict, key);
12132 Py_DECREF(key);
12133 if (args == NULL) {
12134 goto onError;
12135 }
12136 args_owned = 1;
12137 arglen = -1;
12138 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012139 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 case '-': flags |= F_LJUST; continue;
12143 case '+': flags |= F_SIGN; continue;
12144 case ' ': flags |= F_BLANK; continue;
12145 case '#': flags |= F_ALT; continue;
12146 case '0': flags |= F_ZERO; continue;
12147 }
12148 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012149 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012150 if (c == '*') {
12151 v = getnextarg(args, arglen, &argidx);
12152 if (v == NULL)
12153 goto onError;
12154 if (!PyLong_Check(v)) {
12155 PyErr_SetString(PyExc_TypeError,
12156 "* wants int");
12157 goto onError;
12158 }
12159 width = PyLong_AsLong(v);
12160 if (width == -1 && PyErr_Occurred())
12161 goto onError;
12162 if (width < 0) {
12163 flags |= F_LJUST;
12164 width = -width;
12165 }
12166 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012168 }
12169 else if (c >= '0' && c <= '9') {
12170 width = c - '0';
12171 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012173 if (c < '0' || c > '9')
12174 break;
12175 if ((width*10) / 10 != width) {
12176 PyErr_SetString(PyExc_ValueError,
12177 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012178 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012179 }
12180 width = width*10 + (c - '0');
12181 }
12182 }
12183 if (c == '.') {
12184 prec = 0;
12185 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012187 if (c == '*') {
12188 v = getnextarg(args, arglen, &argidx);
12189 if (v == NULL)
12190 goto onError;
12191 if (!PyLong_Check(v)) {
12192 PyErr_SetString(PyExc_TypeError,
12193 "* wants int");
12194 goto onError;
12195 }
12196 prec = PyLong_AsLong(v);
12197 if (prec == -1 && PyErr_Occurred())
12198 goto onError;
12199 if (prec < 0)
12200 prec = 0;
12201 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012203 }
12204 else if (c >= '0' && c <= '9') {
12205 prec = c - '0';
12206 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012208 if (c < '0' || c > '9')
12209 break;
12210 if ((prec*10) / 10 != prec) {
12211 PyErr_SetString(PyExc_ValueError,
12212 "prec too big");
12213 goto onError;
12214 }
12215 prec = prec*10 + (c - '0');
12216 }
12217 }
12218 } /* prec */
12219 if (fmtcnt >= 0) {
12220 if (c == 'h' || c == 'l' || c == 'L') {
12221 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012223 }
12224 }
12225 if (fmtcnt < 0) {
12226 PyErr_SetString(PyExc_ValueError,
12227 "incomplete format");
12228 goto onError;
12229 }
12230 if (c != '%') {
12231 v = getnextarg(args, arglen, &argidx);
12232 if (v == NULL)
12233 goto onError;
12234 }
12235 sign = 0;
12236 fill = ' ';
12237 switch (c) {
12238
12239 case '%':
12240 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012241 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012242 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012244 len = 1;
12245 break;
12246
12247 case 's':
12248 case 'r':
12249 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012250 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012251 temp = v;
12252 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012253 }
12254 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012255 if (c == 's')
12256 temp = PyObject_Str(v);
12257 else if (c == 'r')
12258 temp = PyObject_Repr(v);
12259 else
12260 temp = PyObject_ASCII(v);
12261 if (temp == NULL)
12262 goto onError;
12263 if (PyUnicode_Check(temp))
12264 /* nothing to do */;
12265 else {
12266 Py_DECREF(temp);
12267 PyErr_SetString(PyExc_TypeError,
12268 "%s argument has non-string str()");
12269 goto onError;
12270 }
12271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272 if (PyUnicode_READY(temp) == -1) {
12273 Py_CLEAR(temp);
12274 goto onError;
12275 }
12276 pbuf = PyUnicode_DATA(temp);
12277 kind = PyUnicode_KIND(temp);
12278 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 if (prec >= 0 && len > prec)
12280 len = prec;
12281 break;
12282
12283 case 'i':
12284 case 'd':
12285 case 'u':
12286 case 'o':
12287 case 'x':
12288 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012289 isnumok = 0;
12290 if (PyNumber_Check(v)) {
12291 PyObject *iobj=NULL;
12292
12293 if (PyLong_Check(v)) {
12294 iobj = v;
12295 Py_INCREF(iobj);
12296 }
12297 else {
12298 iobj = PyNumber_Long(v);
12299 }
12300 if (iobj!=NULL) {
12301 if (PyLong_Check(iobj)) {
12302 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012303 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 Py_DECREF(iobj);
12305 if (!temp)
12306 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 if (PyUnicode_READY(temp) == -1) {
12308 Py_CLEAR(temp);
12309 goto onError;
12310 }
12311 pbuf = PyUnicode_DATA(temp);
12312 kind = PyUnicode_KIND(temp);
12313 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012314 sign = 1;
12315 }
12316 else {
12317 Py_DECREF(iobj);
12318 }
12319 }
12320 }
12321 if (!isnumok) {
12322 PyErr_Format(PyExc_TypeError,
12323 "%%%c format: a number is required, "
12324 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12325 goto onError;
12326 }
12327 if (flags & F_ZERO)
12328 fill = '0';
12329 break;
12330
12331 case 'e':
12332 case 'E':
12333 case 'f':
12334 case 'F':
12335 case 'g':
12336 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012337 temp = formatfloat(v, flags, prec, c);
12338 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012339 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 if (PyUnicode_READY(temp) == -1) {
12341 Py_CLEAR(temp);
12342 goto onError;
12343 }
12344 pbuf = PyUnicode_DATA(temp);
12345 kind = PyUnicode_KIND(temp);
12346 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012347 sign = 1;
12348 if (flags & F_ZERO)
12349 fill = '0';
12350 break;
12351
12352 case 'c':
12353 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012355 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012356 if (len < 0)
12357 goto onError;
12358 break;
12359
12360 default:
12361 PyErr_Format(PyExc_ValueError,
12362 "unsupported format character '%c' (0x%x) "
12363 "at index %zd",
12364 (31<=c && c<=126) ? (char)c : '?',
12365 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012367 goto onError;
12368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 /* pbuf is initialized here. */
12370 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012371 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12373 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12374 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 len--;
12376 }
12377 else if (flags & F_SIGN)
12378 sign = '+';
12379 else if (flags & F_BLANK)
12380 sign = ' ';
12381 else
12382 sign = 0;
12383 }
12384 if (width < len)
12385 width = len;
12386 if (rescnt - (sign != 0) < width) {
12387 reslen -= rescnt;
12388 rescnt = width + fmtcnt + 100;
12389 reslen += rescnt;
12390 if (reslen < 0) {
12391 Py_XDECREF(temp);
12392 PyErr_NoMemory();
12393 goto onError;
12394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012395 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12396 if (res0 == 0) {
12397 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012398 Py_XDECREF(temp);
12399 goto onError;
12400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012402 }
12403 if (sign) {
12404 if (fill != ' ')
12405 *res++ = sign;
12406 rescnt--;
12407 if (width > len)
12408 width--;
12409 }
12410 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12412 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012413 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12415 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012416 }
12417 rescnt -= 2;
12418 width -= 2;
12419 if (width < 0)
12420 width = 0;
12421 len -= 2;
12422 }
12423 if (width > len && !(flags & F_LJUST)) {
12424 do {
12425 --rescnt;
12426 *res++ = fill;
12427 } while (--width > len);
12428 }
12429 if (fill == ' ') {
12430 if (sign)
12431 *res++ = sign;
12432 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12434 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12435 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12436 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012437 }
12438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 /* Copy all characters, preserving len */
12440 len1 = len;
12441 while (len1--) {
12442 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12443 rescnt--;
12444 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012445 while (--width >= len) {
12446 --rescnt;
12447 *res++ = ' ';
12448 }
12449 if (dict && (argidx < arglen) && c != '%') {
12450 PyErr_SetString(PyExc_TypeError,
12451 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012452 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 goto onError;
12454 }
12455 Py_XDECREF(temp);
12456 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457 } /* until end */
12458 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012459 PyErr_SetString(PyExc_TypeError,
12460 "not all arguments converted during string formatting");
12461 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462 }
12463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464
12465 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12466 if (*res > max)
12467 max = *res;
12468 result = PyUnicode_New(reslen - rescnt, max);
12469 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 kind = PyUnicode_KIND(result);
12472 for (res = res0; res < res0+reslen-rescnt; res++)
12473 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12474 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012476 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477 }
12478 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479 return (PyObject *)result;
12480
Benjamin Peterson29060642009-01-31 22:14:21 +000012481 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483 Py_DECREF(uformat);
12484 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012485 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486 }
12487 return NULL;
12488}
12489
Jeremy Hylton938ace62002-07-17 16:30:39 +000012490static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012491unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12492
Tim Peters6d6c1a32001-08-02 04:15:00 +000012493static PyObject *
12494unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12495{
Benjamin Peterson29060642009-01-31 22:14:21 +000012496 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012497 static char *kwlist[] = {"object", "encoding", "errors", 0};
12498 char *encoding = NULL;
12499 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012500
Benjamin Peterson14339b62009-01-31 16:36:08 +000012501 if (type != &PyUnicode_Type)
12502 return unicode_subtype_new(type, args, kwds);
12503 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012504 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012505 return NULL;
12506 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012508 if (encoding == NULL && errors == NULL)
12509 return PyObject_Str(x);
12510 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012511 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012512}
12513
Guido van Rossume023fe02001-08-30 03:12:59 +000012514static PyObject *
12515unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12516{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012517 PyUnicodeObject *unicode, *self;
12518 Py_ssize_t length, char_size;
12519 int share_wstr, share_utf8;
12520 unsigned int kind;
12521 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012522
Benjamin Peterson14339b62009-01-31 16:36:08 +000012523 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012524
12525 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12526 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012527 return NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012528 assert(PyUnicode_Check(unicode));
12529 if (PyUnicode_READY(unicode))
12530 return NULL;
12531
12532 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12533 if (self == NULL) {
12534 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012535 return NULL;
12536 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012537 kind = PyUnicode_KIND(unicode);
12538 length = PyUnicode_GET_LENGTH(unicode);
12539
12540 _PyUnicode_LENGTH(self) = length;
12541 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12542 _PyUnicode_STATE(self).interned = 0;
12543 _PyUnicode_STATE(self).kind = kind;
12544 _PyUnicode_STATE(self).compact = 0;
12545 _PyUnicode_STATE(self).ascii = 0;
12546 _PyUnicode_STATE(self).ready = 1;
12547 _PyUnicode_WSTR(self) = NULL;
12548 _PyUnicode_UTF8_LENGTH(self) = 0;
12549 _PyUnicode_UTF8(self) = NULL;
12550 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012551 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012552
12553 share_utf8 = 0;
12554 share_wstr = 0;
12555 if (kind == PyUnicode_1BYTE_KIND) {
12556 char_size = 1;
12557 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12558 share_utf8 = 1;
12559 }
12560 else if (kind == PyUnicode_2BYTE_KIND) {
12561 char_size = 2;
12562 if (sizeof(wchar_t) == 2)
12563 share_wstr = 1;
12564 }
12565 else {
12566 assert(kind == PyUnicode_4BYTE_KIND);
12567 char_size = 4;
12568 if (sizeof(wchar_t) == 4)
12569 share_wstr = 1;
12570 }
12571
12572 /* Ensure we won't overflow the length. */
12573 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12574 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012576 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012577 data = PyObject_MALLOC((length + 1) * char_size);
12578 if (data == NULL) {
12579 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 goto onError;
12581 }
12582
Victor Stinnerc3c74152011-10-02 20:39:55 +020012583 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012584 if (share_utf8) {
12585 _PyUnicode_UTF8_LENGTH(self) = length;
12586 _PyUnicode_UTF8(self) = data;
12587 }
12588 if (share_wstr) {
12589 _PyUnicode_WSTR_LENGTH(self) = length;
12590 _PyUnicode_WSTR(self) = (wchar_t *)data;
12591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012592
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012593 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12594 PyUnicode_KIND_SIZE(kind, length + 1));
12595 Py_DECREF(unicode);
12596 return (PyObject *)self;
12597
12598onError:
12599 Py_DECREF(unicode);
12600 Py_DECREF(self);
12601 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012602}
12603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012604PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012605 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012606\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012607Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012608encoding defaults to the current default string encoding.\n\
12609errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012610
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012611static PyObject *unicode_iter(PyObject *seq);
12612
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012614 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012615 "str", /* tp_name */
12616 sizeof(PyUnicodeObject), /* tp_size */
12617 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012619 (destructor)unicode_dealloc, /* tp_dealloc */
12620 0, /* tp_print */
12621 0, /* tp_getattr */
12622 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012623 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012624 unicode_repr, /* tp_repr */
12625 &unicode_as_number, /* tp_as_number */
12626 &unicode_as_sequence, /* tp_as_sequence */
12627 &unicode_as_mapping, /* tp_as_mapping */
12628 (hashfunc) unicode_hash, /* tp_hash*/
12629 0, /* tp_call*/
12630 (reprfunc) unicode_str, /* tp_str */
12631 PyObject_GenericGetAttr, /* tp_getattro */
12632 0, /* tp_setattro */
12633 0, /* tp_as_buffer */
12634 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012635 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012636 unicode_doc, /* tp_doc */
12637 0, /* tp_traverse */
12638 0, /* tp_clear */
12639 PyUnicode_RichCompare, /* tp_richcompare */
12640 0, /* tp_weaklistoffset */
12641 unicode_iter, /* tp_iter */
12642 0, /* tp_iternext */
12643 unicode_methods, /* tp_methods */
12644 0, /* tp_members */
12645 0, /* tp_getset */
12646 &PyBaseObject_Type, /* tp_base */
12647 0, /* tp_dict */
12648 0, /* tp_descr_get */
12649 0, /* tp_descr_set */
12650 0, /* tp_dictoffset */
12651 0, /* tp_init */
12652 0, /* tp_alloc */
12653 unicode_new, /* tp_new */
12654 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655};
12656
12657/* Initialize the Unicode implementation */
12658
Thomas Wouters78890102000-07-22 19:25:51 +000012659void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012661 int i;
12662
Thomas Wouters477c8d52006-05-27 19:21:47 +000012663 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012665 0x000A, /* LINE FEED */
12666 0x000D, /* CARRIAGE RETURN */
12667 0x001C, /* FILE SEPARATOR */
12668 0x001D, /* GROUP SEPARATOR */
12669 0x001E, /* RECORD SEPARATOR */
12670 0x0085, /* NEXT LINE */
12671 0x2028, /* LINE SEPARATOR */
12672 0x2029, /* PARAGRAPH SEPARATOR */
12673 };
12674
Fred Drakee4315f52000-05-09 19:53:39 +000012675 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012676 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012677 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012679
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012680 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012681 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012682 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012683 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012684
12685 /* initialize the linebreak bloom filter */
12686 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012688 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012689
12690 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691}
12692
12693/* Finalize the Unicode implementation */
12694
/* Clear the free list of unicode objects.  This implementation has
   nothing to release and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
12700
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701void
Thomas Wouters78890102000-07-22 19:25:51 +000012702_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012704 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012706 Py_XDECREF(unicode_empty);
12707 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012708
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012709 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012710 if (unicode_latin1[i]) {
12711 Py_DECREF(unicode_latin1[i]);
12712 unicode_latin1[i] = NULL;
12713 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012714 }
Christian Heimesa156e092008-02-16 07:38:31 +000012715 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012717
Walter Dörwald16807132007-05-25 13:52:07 +000012718void
12719PyUnicode_InternInPlace(PyObject **p)
12720{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012721 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12722 PyObject *t;
12723 if (s == NULL || !PyUnicode_Check(s))
12724 Py_FatalError(
12725 "PyUnicode_InternInPlace: unicode strings only please!");
12726 /* If it's a subclass, we don't really know what putting
12727 it in the interned dict might do. */
12728 if (!PyUnicode_CheckExact(s))
12729 return;
12730 if (PyUnicode_CHECK_INTERNED(s))
12731 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 if (PyUnicode_READY(s) == -1) {
12733 assert(0 && "ready fail in intern...");
12734 return;
12735 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012736 if (interned == NULL) {
12737 interned = PyDict_New();
12738 if (interned == NULL) {
12739 PyErr_Clear(); /* Don't leave an exception */
12740 return;
12741 }
12742 }
12743 /* It might be that the GetItem call fails even
12744 though the key is present in the dictionary,
12745 namely when this happens during a stack overflow. */
12746 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012747 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012748 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012749
Benjamin Peterson29060642009-01-31 22:14:21 +000012750 if (t) {
12751 Py_INCREF(t);
12752 Py_DECREF(*p);
12753 *p = t;
12754 return;
12755 }
Walter Dörwald16807132007-05-25 13:52:07 +000012756
Benjamin Peterson14339b62009-01-31 16:36:08 +000012757 PyThreadState_GET()->recursion_critical = 1;
12758 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12759 PyErr_Clear();
12760 PyThreadState_GET()->recursion_critical = 0;
12761 return;
12762 }
12763 PyThreadState_GET()->recursion_critical = 0;
12764 /* The two references in interned are not counted by refcnt.
12765 The deallocator will take care of this */
12766 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012768}
12769
12770void
12771PyUnicode_InternImmortal(PyObject **p)
12772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12774
Benjamin Peterson14339b62009-01-31 16:36:08 +000012775 PyUnicode_InternInPlace(p);
12776 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012778 Py_INCREF(*p);
12779 }
Walter Dörwald16807132007-05-25 13:52:07 +000012780}
12781
12782PyObject *
12783PyUnicode_InternFromString(const char *cp)
12784{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012785 PyObject *s = PyUnicode_FromString(cp);
12786 if (s == NULL)
12787 return NULL;
12788 PyUnicode_InternInPlace(&s);
12789 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012790}
12791
Alexander Belopolsky40018472011-02-26 01:02:56 +000012792void
12793_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012794{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012795 PyObject *keys;
12796 PyUnicodeObject *s;
12797 Py_ssize_t i, n;
12798 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012799
Benjamin Peterson14339b62009-01-31 16:36:08 +000012800 if (interned == NULL || !PyDict_Check(interned))
12801 return;
12802 keys = PyDict_Keys(interned);
12803 if (keys == NULL || !PyList_Check(keys)) {
12804 PyErr_Clear();
12805 return;
12806 }
Walter Dörwald16807132007-05-25 13:52:07 +000012807
Benjamin Peterson14339b62009-01-31 16:36:08 +000012808 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12809 detector, interned unicode strings are not forcibly deallocated;
12810 rather, we give them their stolen references back, and then clear
12811 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012812
Benjamin Peterson14339b62009-01-31 16:36:08 +000012813 n = PyList_GET_SIZE(keys);
12814 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012816 for (i = 0; i < n; i++) {
12817 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 if (PyUnicode_READY(s) == -1)
12819 fprintf(stderr, "could not ready string\n");
12820 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012821 case SSTATE_NOT_INTERNED:
12822 /* XXX Shouldn't happen */
12823 break;
12824 case SSTATE_INTERNED_IMMORTAL:
12825 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012827 break;
12828 case SSTATE_INTERNED_MORTAL:
12829 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012831 break;
12832 default:
12833 Py_FatalError("Inconsistent interned string state.");
12834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012836 }
12837 fprintf(stderr, "total size of all interned strings: "
12838 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12839 "mortal/immortal\n", mortal_size, immortal_size);
12840 Py_DECREF(keys);
12841 PyDict_Clear(interned);
12842 Py_DECREF(interned);
12843 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012844}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012845
12846
12847/********************* Unicode Iterator **************************/
12848
12849typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012850 PyObject_HEAD
12851 Py_ssize_t it_index;
12852 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012853} unicodeiterobject;
12854
12855static void
12856unicodeiter_dealloc(unicodeiterobject *it)
12857{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012858 _PyObject_GC_UNTRACK(it);
12859 Py_XDECREF(it->it_seq);
12860 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012861}
12862
12863static int
12864unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12865{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012866 Py_VISIT(it->it_seq);
12867 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012868}
12869
12870static PyObject *
12871unicodeiter_next(unicodeiterobject *it)
12872{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012873 PyUnicodeObject *seq;
12874 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012875
Benjamin Peterson14339b62009-01-31 16:36:08 +000012876 assert(it != NULL);
12877 seq = it->it_seq;
12878 if (seq == NULL)
12879 return NULL;
12880 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012882 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12883 int kind = PyUnicode_KIND(seq);
12884 void *data = PyUnicode_DATA(seq);
12885 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12886 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012887 if (item != NULL)
12888 ++it->it_index;
12889 return item;
12890 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012891
Benjamin Peterson14339b62009-01-31 16:36:08 +000012892 Py_DECREF(seq);
12893 it->it_seq = NULL;
12894 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012895}
12896
12897static PyObject *
12898unicodeiter_len(unicodeiterobject *it)
12899{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012900 Py_ssize_t len = 0;
12901 if (it->it_seq)
12902 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12903 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012904}
12905
12906PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12907
12908static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012909 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012910 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012911 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012912};
12913
12914PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012915 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12916 "str_iterator", /* tp_name */
12917 sizeof(unicodeiterobject), /* tp_basicsize */
12918 0, /* tp_itemsize */
12919 /* methods */
12920 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12921 0, /* tp_print */
12922 0, /* tp_getattr */
12923 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012924 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012925 0, /* tp_repr */
12926 0, /* tp_as_number */
12927 0, /* tp_as_sequence */
12928 0, /* tp_as_mapping */
12929 0, /* tp_hash */
12930 0, /* tp_call */
12931 0, /* tp_str */
12932 PyObject_GenericGetAttr, /* tp_getattro */
12933 0, /* tp_setattro */
12934 0, /* tp_as_buffer */
12935 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12936 0, /* tp_doc */
12937 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12938 0, /* tp_clear */
12939 0, /* tp_richcompare */
12940 0, /* tp_weaklistoffset */
12941 PyObject_SelfIter, /* tp_iter */
12942 (iternextfunc)unicodeiter_next, /* tp_iternext */
12943 unicodeiter_methods, /* tp_methods */
12944 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012945};
12946
12947static PyObject *
12948unicode_iter(PyObject *seq)
12949{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012950 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012951
Benjamin Peterson14339b62009-01-31 16:36:08 +000012952 if (!PyUnicode_Check(seq)) {
12953 PyErr_BadInternalCall();
12954 return NULL;
12955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 if (PyUnicode_READY(seq) == -1)
12957 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012958 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12959 if (it == NULL)
12960 return NULL;
12961 it->it_index = 0;
12962 Py_INCREF(seq);
12963 it->it_seq = (PyUnicodeObject *)seq;
12964 _PyObject_GC_TRACK(it);
12965 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012966}
12967
/* Include uniops.h twice: first with UNIOP()/UNIOP_t bound to the
   Py_UNICODE_ prefix and the Py_UNICODE type, then with them bound to the
   Py_UCS4_ prefix and the Py_UCS4 type.  Both macros are undefined again
   after each inclusion.
   NOTE(review): presumably uniops.h is a template that defines one set of
   helpers per inclusion -- confirm against the header. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012978
Victor Stinner71133ff2010-09-01 23:43:53 +000012979Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012980PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012981{
12982 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12983 Py_UNICODE *copy;
12984 Py_ssize_t size;
12985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 if (!PyUnicode_Check(unicode)) {
12987 PyErr_BadArgument();
12988 return NULL;
12989 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012990 /* Ensure we won't overflow the size. */
12991 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12992 PyErr_NoMemory();
12993 return NULL;
12994 }
12995 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12996 size *= sizeof(Py_UNICODE);
12997 copy = PyMem_Malloc(size);
12998 if (copy == NULL) {
12999 PyErr_NoMemory();
13000 return NULL;
13001 }
13002 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13003 return copy;
13004}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013005
Georg Brandl66c221e2010-10-14 07:04:07 +000013006/* A _string module, to export formatter_parser and formatter_field_name_split
13007 to the string.Formatter class implemented in Python. */
13008
13009static PyMethodDef _string_methods[] = {
13010 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13011 METH_O, PyDoc_STR("split the argument as a field name")},
13012 {"formatter_parser", (PyCFunction) formatter_parser,
13013 METH_O, PyDoc_STR("parse the argument as a format string")},
13014 {NULL, NULL}
13015};
13016
13017static struct PyModuleDef _string_module = {
13018 PyModuleDef_HEAD_INIT,
13019 "_string",
13020 PyDoc_STR("string helper module"),
13021 0,
13022 _string_methods,
13023 NULL,
13024 NULL,
13025 NULL,
13026 NULL
13027};
13028
13029PyMODINIT_FUNC
13030PyInit__string(void)
13031{
13032 return PyModule_Create(&_string_module);
13033}
13034
13035
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013036#ifdef __cplusplus
13037}
13038#endif