blob: fed2cc9fb4f0faa1518e2094b23d0f6f0a2ff8b7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Limit for the Unicode object free list: the maximum number of entries
   kept on it. */

#define PyUnicode_MAXFREELIST 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, depending on whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Widen or narrow a run of characters from one integer type to another.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (half-open range, both of type "from_type *").
   to points at the destination buffer and is converted to "to_type *".
   Each source character is stored with a plain cast to to_type; no range
   check is performed.  begin and to are evaluated once, end is evaluated
   on every iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *src_ = (begin);                \
        to_type *dst_ = (to_type *)(to);                \
        while (src_ < (end)) {                          \
            *dst_ = (to_type)*src_;                     \
            ++src_;                                     \
            ++dst_;                                     \
        }                                               \
    } while (0)
107
/* Private field accessors.  Apart from _PyUnicode_KIND and
   _PyUnicode_GET_LENGTH none of them checks the type of op; each one
   simply casts op to the struct that declares the field. */

/* UTF-8 buffer of op: for compact ASCII strings this is the memory that
   directly follows the PyASCIIObject header, otherwise it is the utf8
   member of the PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((char*)((PyASCIIObject*)(op) + 1)) :              \
     ((PyCompactUnicodeObject*)(op))->utf8)
/* Length belonging to _PyUnicode_UTF8(op): the string length for compact
   ASCII strings, the utf8_length member otherwise. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->utf8_length)
/* Raw struct members; all of these expand to lvalues. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
/* Reads the PyCompactUnicodeObject layout, so op must not be a compact
   ASCII object (PyUnicode_New() never sets this field for those). */
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* Checked variants: assert that op is a unicode object before reading. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200131
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by the character's code point
   (0..255). */
static PyUnicodeObject *unicode_latin1[256];
148
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point 0..127; an entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
179
/* Forward declarations of encoder error helpers; the definitions are not
   part of this section of the file. */

static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       const Py_UNICODE *unicode, Py_ssize_t size,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000192
/* Same for linebreaks.

   Lookup table indexed by code point 0..127; an entry is 1 for:
     0x0A LINE FEED          0x0B LINE TABULATION
     0x0C FORM FEED          0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR     0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
   and 0 for everything else. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
220
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300221/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
222 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000223Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000224PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000225{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000226#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000227 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000228#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000229 /* This is actually an illegal character, so it should
230 not be passed to unichr. */
231 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232#endif
233}
234
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each unicode character (ch & (BLOOM_WIDTH - 1))
   as the bit index.  A set bit means "possibly present", a clear bit
   means "definitely absent". */

/* the linebreak mask (bloom_linebreak) is not filled in here; it is
   presumably set up by the module initialisation code further down
   -- TODO confirm */

/* BLOOM_WIDTH: number of usable bits in a BLOOM_MASK, derived from the
   width of "long". */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for ch in mask (mask must be an lvalue);
   BLOOM tests it and yields a non-zero value when the bit is set. */
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Linebreak test: ASCII characters are answered from the ascii_linebreak
   table; for everything else the bloom mask is consulted first so that
   Py_UNICODE_ISLINEBREAK() only runs for candidates.  ch is evaluated
   more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
Alexander Belopolsky40018472011-02-26 01:02:56 +0000264Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200265make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000266{
267 /* calculate simple bloom-style bitmask for a given unicode string */
268
Antoine Pitrouf068f942010-01-13 14:19:12 +0000269 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270 Py_ssize_t i;
271
272 mask = 0;
273 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200274 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000275
276 return mask;
277}
278
/* True if chr occurs in the unicode object str.  mask must be the bloom
   mask built for str: the cheap bit test runs first and the full
   PyUnicode_FindChar() scan over the whole string only happens when the
   bit is set.  chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
Guido van Rossumd57fd912000-03-10 22:53:23 +0000283/* --- Unicode Object ----------------------------------------------------- */
284
/* Forward declaration; the definition is not part of this section of the
   file.  Takes a string and a callback that receives a PyUnicodeObject and
   returns a Py_UCS4. */
static PyObject *
fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
287
288Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
289 Py_ssize_t size, Py_UCS4 ch,
290 int direction)
291{
292 /* like wcschr, but doesn't stop at NULL characters */
293 Py_ssize_t i;
294 if (direction == 1) {
295 for(i = 0; i < size; i++)
296 if (PyUnicode_READ(kind, s, i) == ch)
297 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
298 }
299 else {
300 for(i = size-1; i >= 0; i--)
301 if (PyUnicode_READ(kind, s, i) == ch)
302 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
303 }
304 return NULL;
305}
306
Alexander Belopolsky40018472011-02-26 01:02:56 +0000307static int
308unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200309 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310{
311 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200313 /* Resizing is only supported for old unicode objects. */
314 assert(!PyUnicode_IS_COMPACT(unicode));
315 assert(_PyUnicode_WSTR(unicode) != NULL);
316
317 /* ... and only if they have not been readied yet, because
318 callees usually rely on the wstr representation when resizing. */
319 assert(unicode->data.any == NULL);
320
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000321 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200322 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000323 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000325 /* Resizing shared object (unicode_empty or single character
326 objects) in-place is not allowed. Use PyUnicode_Resize()
327 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000328
Benjamin Peterson14339b62009-01-31 16:36:08 +0000329 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200330 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
331 _PyUnicode_WSTR(unicode)[0] < 256U &&
332 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000334 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 return -1;
336 }
337
Thomas Wouters477c8d52006-05-27 19:21:47 +0000338 /* We allocate one more byte to make sure the string is Ux0000 terminated.
339 The overallocation is also used by fastsearch, which assumes that it's
340 safe to look at str[length] (without making any assumptions about what
341 it contains). */
342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200343 oldstr = _PyUnicode_WSTR(unicode);
344 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
345 sizeof(Py_UNICODE) * (length + 1));
346 if (!_PyUnicode_WSTR(unicode)) {
347 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 PyErr_NoMemory();
349 return -1;
350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200351 _PyUnicode_WSTR(unicode)[length] = 0;
352 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200355 if (unicode->data.any != NULL) {
356 PyObject_FREE(unicode->data.any);
357 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
358 PyObject_FREE(unicode->_base.utf8);
359 }
360 unicode->_base.utf8 = NULL;
361 unicode->_base.utf8_length = 0;
362 unicode->data.any = NULL;
363 _PyUnicode_LENGTH(unicode) = 0;
364 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
365 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200367 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000368
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 return 0;
370}
371
372/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000373 Ux0000 terminated; some code (e.g. new_identifier)
374 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375
376 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000377 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378
379*/
380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200381#ifdef Py_DEBUG
382int unicode_old_new_calls = 0;
383#endif
384
Alexander Belopolsky40018472011-02-26 01:02:56 +0000385static PyUnicodeObject *
386_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387{
388 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200389 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000390
Thomas Wouters477c8d52006-05-27 19:21:47 +0000391 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392 if (length == 0 && unicode_empty != NULL) {
393 Py_INCREF(unicode_empty);
394 return unicode_empty;
395 }
396
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000397 /* Ensure we won't overflow the size. */
398 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
399 return (PyUnicodeObject *)PyErr_NoMemory();
400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200401 if (length < 0) {
402 PyErr_SetString(PyExc_SystemError,
403 "Negative size passed to _PyUnicode_New");
404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200407#ifdef Py_DEBUG
408 ++unicode_old_new_calls;
409#endif
410
411 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
412 if (unicode == NULL)
413 return NULL;
414 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
415 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
416 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 PyErr_NoMemory();
418 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200420
Jeremy Hyltond8082792003-09-16 19:41:39 +0000421 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000422 * the caller fails before initializing str -- unicode_resize()
423 * reads str[0], and the Keep-Alive optimization can keep memory
424 * allocated for str alive across a call to unicode_dealloc(unicode).
425 * We don't want unicode_resize to read uninitialized memory in
426 * that case.
427 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200428 _PyUnicode_WSTR(unicode)[0] = 0;
429 _PyUnicode_WSTR(unicode)[length] = 0;
430 _PyUnicode_WSTR_LENGTH(unicode) = length;
431 _PyUnicode_HASH(unicode) = -1;
432 _PyUnicode_STATE(unicode).interned = 0;
433 _PyUnicode_STATE(unicode).kind = 0;
434 _PyUnicode_STATE(unicode).compact = 0;
435 _PyUnicode_STATE(unicode).ready = 0;
436 _PyUnicode_STATE(unicode).ascii = 0;
437 unicode->data.any = NULL;
438 _PyUnicode_LENGTH(unicode) = 0;
439 unicode->_base.utf8 = NULL;
440 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000442
Benjamin Peterson29060642009-01-31 22:14:21 +0000443 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000444 /* XXX UNREF/NEWREF interface should be more symmetrical */
445 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000446 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000447 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000449}
450
#ifdef Py_DEBUG
/* Number of objects created through PyUnicode_New() (debug builds only) */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of _PyUnicode_UTF8(). */
char *
_PyUnicode_utf8(void *unicode)
{
    return _PyUnicode_UTF8(unicode);
}

/* Function form of _PyUnicode_COMPACT_DATA(). */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact flags and the candidate data
   addresses to stdout, then return PyUnicode_DATA(unicode). */
void *
_PyUnicode_data(void *unicode)
{
    PyASCIIObject *ascii_hdr = (PyASCIIObject *)unicode;
    PyCompactUnicodeObject *compact_hdr = (PyCompactUnicodeObject *)unicode;

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    /* first byte after each of the two possible headers */
    printf("ascii op %p\n", (void*)(ascii_hdr + 1));
    printf("compact op %p\n", (void*)(compact_hdr + 1));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
472
473PyObject *
474PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
475{
476 PyObject *obj;
477 PyCompactUnicodeObject *unicode;
478 void *data;
479 int kind_state;
480 int is_sharing = 0, is_ascii = 0;
481 Py_ssize_t char_size;
482 Py_ssize_t struct_size;
483
484 /* Optimization for empty strings */
485 if (size == 0 && unicode_empty != NULL) {
486 Py_INCREF(unicode_empty);
487 return (PyObject *)unicode_empty;
488 }
489
490#ifdef Py_DEBUG
491 ++unicode_new_new_calls;
492#endif
493
494 struct_size = sizeof(PyCompactUnicodeObject);
495 if (maxchar < 128) {
496 kind_state = PyUnicode_1BYTE_KIND;
497 char_size = 1;
498 is_ascii = 1;
499 struct_size = sizeof(PyASCIIObject);
500 }
501 else if (maxchar < 256) {
502 kind_state = PyUnicode_1BYTE_KIND;
503 char_size = 1;
504 }
505 else if (maxchar < 65536) {
506 kind_state = PyUnicode_2BYTE_KIND;
507 char_size = 2;
508 if (sizeof(wchar_t) == 2)
509 is_sharing = 1;
510 }
511 else {
512 kind_state = PyUnicode_4BYTE_KIND;
513 char_size = 4;
514 if (sizeof(wchar_t) == 4)
515 is_sharing = 1;
516 }
517
518 /* Ensure we won't overflow the size. */
519 if (size < 0) {
520 PyErr_SetString(PyExc_SystemError,
521 "Negative size passed to PyUnicode_New");
522 return NULL;
523 }
524 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
525 return PyErr_NoMemory();
526
527 /* Duplicated allocation code from _PyObject_New() instead of a call to
528 * PyObject_New() so we are able to allocate space for the object and
529 * it's data buffer.
530 */
531 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
532 if (obj == NULL)
533 return PyErr_NoMemory();
534 obj = PyObject_INIT(obj, &PyUnicode_Type);
535 if (obj == NULL)
536 return NULL;
537
538 unicode = (PyCompactUnicodeObject *)obj;
539 if (is_ascii)
540 data = ((PyASCIIObject*)obj) + 1;
541 else
542 data = unicode + 1;
543 _PyUnicode_LENGTH(unicode) = size;
544 _PyUnicode_HASH(unicode) = -1;
545 _PyUnicode_STATE(unicode).interned = 0;
546 _PyUnicode_STATE(unicode).kind = kind_state;
547 _PyUnicode_STATE(unicode).compact = 1;
548 _PyUnicode_STATE(unicode).ready = 1;
549 _PyUnicode_STATE(unicode).ascii = is_ascii;
550 if (is_ascii) {
551 ((char*)data)[size] = 0;
552 _PyUnicode_WSTR(unicode) = NULL;
553 }
554 else if (kind_state == PyUnicode_1BYTE_KIND) {
555 ((char*)data)[size] = 0;
556 _PyUnicode_WSTR(unicode) = NULL;
557 _PyUnicode_WSTR_LENGTH(unicode) = 0;
558 unicode->utf8_length = 0;
559 unicode->utf8 = NULL;
560 }
561 else {
562 unicode->utf8 = NULL;
563 if (kind_state == PyUnicode_2BYTE_KIND)
564 ((Py_UCS2*)data)[size] = 0;
565 else /* kind_state == PyUnicode_4BYTE_KIND */
566 ((Py_UCS4*)data)[size] = 0;
567 if (is_sharing) {
568 _PyUnicode_WSTR_LENGTH(unicode) = size;
569 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
570 }
571 else {
572 _PyUnicode_WSTR_LENGTH(unicode) = 0;
573 _PyUnicode_WSTR(unicode) = NULL;
574 }
575 }
576 return obj;
577}
578
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  It
   decodes surrogate pairs; the other conversions are implemented as macros
   for efficiency.

   [begin, end) is the wchar_t input.  The output is written to the 4-byte
   data buffer of unicode, which must already have the 4-byte kind and the
   exact final length.  A high surrogate that is not followed by a low
   surrogate (and any lone low surrogate) is copied through unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character.

   Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        is_pair = (*src >= 0xD800 && *src <= 0xDBFF
                   && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src;
            src++;
            continue;
        }

        /* combine high and low surrogate into one code point */
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
617
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200618Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200619PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
620 PyObject *from, Py_ssize_t from_start,
621 Py_ssize_t how_many)
622{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200623 unsigned int from_kind, to_kind;
624 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200625
Victor Stinnerb1536152011-09-30 02:26:10 +0200626 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
627 PyErr_BadInternalCall();
628 return -1;
629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200630
631 if (PyUnicode_READY(from))
632 return -1;
633 if (PyUnicode_READY(to))
634 return -1;
635
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200636 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200637 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
638 PyErr_Format(PyExc_ValueError,
639 "Cannot write %zi characters at %zi "
640 "in a string of %zi characters",
641 how_many, to_start, PyUnicode_GET_LENGTH(to));
642 return -1;
643 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200644 if (how_many == 0)
645 return 0;
646
647 if (Py_REFCNT(to) != 1) {
648 PyErr_SetString(PyExc_ValueError,
649 "Cannot modify a string having more than 1 reference");
650 return -1;
651 }
Victor Stinnerc17f5402011-09-29 00:16:58 +0200652 _PyUnicode_DIRTY(to);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200655 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200656 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200657 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200658
659 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200660 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200661 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200662 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200663 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200664 + PyUnicode_KIND_SIZE(from_kind, from_start),
665 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200666 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200667 else if (from_kind == PyUnicode_1BYTE_KIND
668 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200669 {
670 _PyUnicode_CONVERT_BYTES(
671 Py_UCS1, Py_UCS2,
672 PyUnicode_1BYTE_DATA(from) + from_start,
673 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
674 PyUnicode_2BYTE_DATA(to) + to_start
675 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200676 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200677 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200678 && to_kind == PyUnicode_4BYTE_KIND)
679 {
680 _PyUnicode_CONVERT_BYTES(
681 Py_UCS1, Py_UCS4,
682 PyUnicode_1BYTE_DATA(from) + from_start,
683 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
684 PyUnicode_4BYTE_DATA(to) + to_start
685 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200686 }
687 else if (from_kind == PyUnicode_2BYTE_KIND
688 && to_kind == PyUnicode_4BYTE_KIND)
689 {
690 _PyUnicode_CONVERT_BYTES(
691 Py_UCS2, Py_UCS4,
692 PyUnicode_2BYTE_DATA(from) + from_start,
693 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
694 PyUnicode_4BYTE_DATA(to) + to_start
695 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200696 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200697 else {
698 int invalid_kinds;
699 if (from_kind > to_kind) {
700 /* slow path to check for character overflow */
701 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
702 Py_UCS4 ch, maxchar;
703 Py_ssize_t i;
704
705 maxchar = 0;
706 invalid_kinds = 0;
707 for (i=0; i < how_many; i++) {
708 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
709 if (ch > maxchar) {
710 maxchar = ch;
711 if (maxchar > to_maxchar) {
712 invalid_kinds = 1;
713 break;
714 }
715 }
716 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
717 }
718 }
719 else
720 invalid_kinds = 1;
721 if (invalid_kinds) {
722 PyErr_Format(PyExc_ValueError,
723 "Cannot copy UCS%u characters "
724 "into a string of UCS%u characters",
725 1 << (from_kind - 1),
726 1 << (to_kind -1));
727 return -1;
728 }
729 }
730 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200731}
732
Victor Stinner17222162011-09-28 22:15:37 +0200733/* Find the maximum code point and count the number of surrogate pairs so a
734 correct string length can be computed before converting a string to UCS4.
735 This function counts single surrogates as a character and not as a pair.
736
737 Return 0 on success, or -1 on error. */
738static int
739find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
740 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200741{
742 const wchar_t *iter;
743
744 if (num_surrogates == NULL || maxchar == NULL) {
745 PyErr_SetString(PyExc_SystemError,
746 "unexpected NULL arguments to "
747 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
748 return -1;
749 }
750
751 *num_surrogates = 0;
752 *maxchar = 0;
753
754 for (iter = begin; iter < end; ) {
755 if (*iter > *maxchar)
756 *maxchar = *iter;
757#if SIZEOF_WCHAR_T == 2
758 if (*iter >= 0xD800 && *iter <= 0xDBFF
759 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
760 {
761 Py_UCS4 surrogate_val;
762 surrogate_val = (((iter[0] & 0x3FF)<<10)
763 | (iter[1] & 0x3FF)) + 0x10000;
764 ++(*num_surrogates);
765 if (surrogate_val > *maxchar)
766 *maxchar = surrogate_val;
767 iter += 2;
768 }
769 else
770 iter++;
771#else
772 iter++;
773#endif
774 }
775 return 0;
776}
777
778#ifdef Py_DEBUG
779int unicode_ready_calls = 0;
780#endif
781
782int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200783_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200784{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200785 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200786 wchar_t *end;
787 Py_UCS4 maxchar = 0;
788 Py_ssize_t num_surrogates;
789#if SIZEOF_WCHAR_T == 2
790 Py_ssize_t length_wo_surrogates;
791#endif
792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200793 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200794 strings were created using _PyObject_New() and where no canonical
795 representation (the str field) has been set yet aka strings
796 which are not yet ready. */
797 assert(PyUnicode_Check(obj));
798 assert(!PyUnicode_IS_READY(obj));
799 assert(!PyUnicode_IS_COMPACT(obj));
800 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200802 assert(unicode->data.any == NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803 assert(unicode->_base.utf8 == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200804 /* Actually, it should neither be interned nor be anything else: */
805 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200806
807#ifdef Py_DEBUG
808 ++unicode_ready_calls;
809#endif
810
811 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200812 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200813 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200814 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200815
816 if (maxchar < 256) {
817 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
818 if (!unicode->data.any) {
819 PyErr_NoMemory();
820 return -1;
821 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200822 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200823 _PyUnicode_WSTR(unicode), end,
824 PyUnicode_1BYTE_DATA(unicode));
825 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
826 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
827 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
828 if (maxchar < 128) {
829 unicode->_base.utf8 = unicode->data.any;
830 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
831 }
832 else {
833 unicode->_base.utf8 = NULL;
834 unicode->_base.utf8_length = 0;
835 }
836 PyObject_FREE(_PyUnicode_WSTR(unicode));
837 _PyUnicode_WSTR(unicode) = NULL;
838 _PyUnicode_WSTR_LENGTH(unicode) = 0;
839 }
840 /* In this case we might have to convert down from 4-byte native
841 wchar_t to 2-byte unicode. */
842 else if (maxchar < 65536) {
843 assert(num_surrogates == 0 &&
844 "FindMaxCharAndNumSurrogatePairs() messed up");
845
Victor Stinner506f5922011-09-28 22:34:18 +0200846#if SIZEOF_WCHAR_T == 2
847 /* We can share representations and are done. */
848 unicode->data.any = _PyUnicode_WSTR(unicode);
849 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
850 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
851 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
852 unicode->_base.utf8 = NULL;
853 unicode->_base.utf8_length = 0;
854#else
855 /* sizeof(wchar_t) == 4 */
856 unicode->data.any = PyObject_MALLOC(
857 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
858 if (!unicode->data.any) {
859 PyErr_NoMemory();
860 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 }
Victor Stinner506f5922011-09-28 22:34:18 +0200862 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
863 _PyUnicode_WSTR(unicode), end,
864 PyUnicode_2BYTE_DATA(unicode));
865 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
866 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
867 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
868 unicode->_base.utf8 = NULL;
869 unicode->_base.utf8_length = 0;
870 PyObject_FREE(_PyUnicode_WSTR(unicode));
871 _PyUnicode_WSTR(unicode) = NULL;
872 _PyUnicode_WSTR_LENGTH(unicode) = 0;
873#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874 }
875 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
876 else {
877#if SIZEOF_WCHAR_T == 2
878 /* in case the native representation is 2-bytes, we need to allocate a
879 new normalized 4-byte version. */
880 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
881 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
882 if (!unicode->data.any) {
883 PyErr_NoMemory();
884 return -1;
885 }
886 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
887 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
888 unicode->_base.utf8 = NULL;
889 unicode->_base.utf8_length = 0;
890 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
891 unicode) < 0) {
892 assert(0 && "ConvertWideCharToUCS4 failed");
893 return -1;
894 }
895 PyObject_FREE(_PyUnicode_WSTR(unicode));
896 _PyUnicode_WSTR(unicode) = NULL;
897 _PyUnicode_WSTR_LENGTH(unicode) = 0;
898#else
899 assert(num_surrogates == 0);
900
901 unicode->data.any = _PyUnicode_WSTR(unicode);
902 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
903 unicode->_base.utf8 = NULL;
904 unicode->_base.utf8_length = 0;
905 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
906#endif
907 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
908 }
909 _PyUnicode_STATE(unicode).ready = 1;
910 return 0;
911}
912
Alexander Belopolsky40018472011-02-26 01:02:56 +0000913static void
914unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000915{
Walter Dörwald16807132007-05-25 13:52:07 +0000916 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000917 case SSTATE_NOT_INTERNED:
918 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000919
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 case SSTATE_INTERNED_MORTAL:
921 /* revive dead object temporarily for DelItem */
922 Py_REFCNT(unicode) = 3;
923 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
924 Py_FatalError(
925 "deletion of interned string failed");
926 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000927
Benjamin Peterson29060642009-01-31 22:14:21 +0000928 case SSTATE_INTERNED_IMMORTAL:
929 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000930
Benjamin Peterson29060642009-01-31 22:14:21 +0000931 default:
932 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000933 }
934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200935 if (_PyUnicode_WSTR(unicode) &&
936 (!PyUnicode_IS_READY(unicode) ||
937 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
938 PyObject_DEL(_PyUnicode_WSTR(unicode));
939 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
940 PyObject_DEL(unicode->_base.utf8);
941
942 if (PyUnicode_IS_COMPACT(unicode)) {
943 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944 }
945 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200946 if (unicode->data.any)
947 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000949 }
950}
951
Alexander Belopolsky40018472011-02-26 01:02:56 +0000952static int
953_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000954{
955 register PyUnicodeObject *v;
956
957 /* Argument checks */
958 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000959 PyErr_BadInternalCall();
960 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000961 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000962 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
964 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000965 PyErr_BadInternalCall();
966 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000967 }
968
969 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970 possible since these are being shared.
971 The same goes for new-representation unicode objects or objects which
972 have already been readied.
973 For these, we simply return a fresh copy with the same Unicode content.
974 */
975 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
976 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
977 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000978 PyUnicodeObject *w = _PyUnicode_New(length);
979 if (w == NULL)
980 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
982 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000983 Py_DECREF(*unicode);
984 *unicode = w;
985 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000986 }
987
988 /* Note that we don't have to modify *unicode for unshared Unicode
989 objects, since we can modify them in-place. */
990 return unicode_resize(v, length);
991}
992
Alexander Belopolsky40018472011-02-26 01:02:56 +0000993int
994PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000995{
996 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
997}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200999static PyObject*
1000get_latin1_char(unsigned char ch)
1001{
1002 PyUnicodeObject *unicode = unicode_latin1[ch];
1003 if (!unicode) {
1004 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1005 if (!unicode)
1006 return NULL;
1007 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1008 unicode_latin1[ch] = unicode;
1009 }
1010 Py_INCREF(unicode);
1011 return (PyObject *)unicode;
1012}
1013
Alexander Belopolsky40018472011-02-26 01:02:56 +00001014PyObject *
1015PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001016{
1017 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001018 Py_UCS4 maxchar = 0;
1019 Py_ssize_t num_surrogates;
1020
1021 if (u == NULL)
1022 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001024 /* If the Unicode data is known at construction time, we can apply
1025 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027 /* Optimization for empty strings */
1028 if (size == 0 && unicode_empty != NULL) {
1029 Py_INCREF(unicode_empty);
1030 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001031 }
Tim Petersced69f82003-09-16 20:30:58 +00001032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001033 /* Single character Unicode objects in the Latin-1 range are
1034 shared when using this constructor */
1035 if (size == 1 && *u < 256)
1036 return get_latin1_char((unsigned char)*u);
1037
1038 /* If not empty and not single character, copy the Unicode data
1039 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001040 if (find_maxchar_surrogates(u, u + size,
1041 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001042 return NULL;
1043
1044 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1045 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046 if (!unicode)
1047 return NULL;
1048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 switch (PyUnicode_KIND(unicode)) {
1050 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001051 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1053 break;
1054 case PyUnicode_2BYTE_KIND:
1055#if Py_UNICODE_SIZE == 2
1056 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1057#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001058 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1060#endif
1061 break;
1062 case PyUnicode_4BYTE_KIND:
1063#if SIZEOF_WCHAR_T == 2
1064 /* This is the only case which has to process surrogates, thus
1065 a simple copy loop is not enough and we need a function. */
1066 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1067 Py_DECREF(unicode);
1068 return NULL;
1069 }
1070#else
1071 assert(num_surrogates == 0);
1072 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1073#endif
1074 break;
1075 default:
1076 assert(0 && "Impossible state");
1077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079 return (PyObject *)unicode;
1080}
1081
Alexander Belopolsky40018472011-02-26 01:02:56 +00001082PyObject *
1083PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001084{
1085 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001086
Benjamin Peterson14339b62009-01-31 16:36:08 +00001087 if (size < 0) {
1088 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001089 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001090 return NULL;
1091 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001092
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001093 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001094 some optimizations which share commonly used objects.
1095 Also, this means the input must be UTF-8, so fall back to the
1096 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001097 if (u != NULL) {
1098
Benjamin Peterson29060642009-01-31 22:14:21 +00001099 /* Optimization for empty strings */
1100 if (size == 0 && unicode_empty != NULL) {
1101 Py_INCREF(unicode_empty);
1102 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001103 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001104
1105 /* Single characters are shared when using this constructor.
1106 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 if (size == 1 && Py_CHARMASK(*u) < 128)
1108 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001109
1110 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001111 }
1112
Walter Dörwald55507312007-05-18 13:12:10 +00001113 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001114 if (!unicode)
1115 return NULL;
1116
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001117 return (PyObject *)unicode;
1118}
1119
Alexander Belopolsky40018472011-02-26 01:02:56 +00001120PyObject *
1121PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001122{
1123 size_t size = strlen(u);
1124 if (size > PY_SSIZE_T_MAX) {
1125 PyErr_SetString(PyExc_OverflowError, "input too long");
1126 return NULL;
1127 }
1128
1129 return PyUnicode_FromStringAndSize(u, size);
1130}
1131
Victor Stinnere57b1c02011-09-28 22:20:48 +02001132static PyObject*
1133_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001134{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135 PyObject *res;
1136 unsigned char max = 127;
1137 Py_ssize_t i;
1138 for (i = 0; i < size; i++) {
1139 if (u[i] & 0x80) {
1140 max = 255;
1141 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001142 }
1143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 res = PyUnicode_New(size, max);
1145 if (!res)
1146 return NULL;
1147 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1148 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001149}
1150
Victor Stinnere57b1c02011-09-28 22:20:48 +02001151static PyObject*
1152_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153{
1154 PyObject *res;
1155 Py_UCS2 max = 0;
1156 Py_ssize_t i;
1157 for (i = 0; i < size; i++)
1158 if (u[i] > max)
1159 max = u[i];
1160 res = PyUnicode_New(size, max);
1161 if (!res)
1162 return NULL;
1163 if (max >= 256)
1164 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1165 else
1166 for (i = 0; i < size; i++)
1167 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1168 return res;
1169}
1170
Victor Stinnere57b1c02011-09-28 22:20:48 +02001171static PyObject*
1172_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173{
1174 PyObject *res;
1175 Py_UCS4 max = 0;
1176 Py_ssize_t i;
1177 for (i = 0; i < size; i++)
1178 if (u[i] > max)
1179 max = u[i];
1180 res = PyUnicode_New(size, max);
1181 if (!res)
1182 return NULL;
1183 if (max >= 0x10000)
1184 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1185 else {
1186 int kind = PyUnicode_KIND(res);
1187 void *data = PyUnicode_DATA(res);
1188 for (i = 0; i < size; i++)
1189 PyUnicode_WRITE(kind, data, i, u[i]);
1190 }
1191 return res;
1192}
1193
1194PyObject*
1195PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1196{
1197 switch(kind) {
1198 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001199 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001201 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001203 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204 }
1205 assert(0);
1206 return NULL;
1207}
1208
Victor Stinner034f6cf2011-09-30 02:26:44 +02001209PyObject*
1210PyUnicode_Copy(PyObject *unicode)
1211{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001212 Py_ssize_t size;
1213 PyObject *copy;
1214 void *data;
1215
Victor Stinner034f6cf2011-09-30 02:26:44 +02001216 if (!PyUnicode_Check(unicode)) {
1217 PyErr_BadInternalCall();
1218 return NULL;
1219 }
1220 if (PyUnicode_READY(unicode))
1221 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001222
1223 size = PyUnicode_GET_LENGTH(unicode);
1224 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1225 if (!copy)
1226 return NULL;
1227 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1228
1229 data = PyUnicode_DATA(unicode);
1230 switch (PyUnicode_KIND(unicode))
1231 {
1232 case PyUnicode_1BYTE_KIND:
1233 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1234 break;
1235 case PyUnicode_2BYTE_KIND:
1236 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1237 break;
1238 case PyUnicode_4BYTE_KIND:
1239 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1240 break;
1241 default:
1242 assert(0);
1243 break;
1244 }
1245 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001246}
1247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248
1249/* Widen Unicode objects to larger buffers.
1250 Return NULL if the string is too wide already. */
1251
1252void*
1253_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1254{
1255 Py_ssize_t i;
1256 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1257 void *d = PyUnicode_DATA(s);
1258 unsigned int skind = PyUnicode_KIND(s);
1259 if (PyUnicode_KIND(s) >= kind) {
1260 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1261 return NULL;
1262 }
1263 switch(kind) {
1264 case PyUnicode_2BYTE_KIND: {
1265 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1266 if (!result) {
1267 PyErr_NoMemory();
1268 return 0;
1269 }
1270 for (i = 0; i < len; i++)
1271 result[i] = ((Py_UCS1*)d)[i];
1272 return result;
1273 }
1274 case PyUnicode_4BYTE_KIND: {
1275 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1276 if (!result) {
1277 PyErr_NoMemory();
1278 return 0;
1279 }
1280 for (i = 0; i < len; i++)
1281 result[i] = PyUnicode_READ(skind, d, i);
1282 return result;
1283 }
1284 }
1285 Py_FatalError("invalid kind");
1286 return NULL;
1287}
1288
1289static Py_UCS4*
1290as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1291 int copy_null)
1292{
1293 int kind;
1294 void *data;
1295 Py_ssize_t len, targetlen;
1296 if (PyUnicode_READY(string) == -1)
1297 return NULL;
1298 kind = PyUnicode_KIND(string);
1299 data = PyUnicode_DATA(string);
1300 len = PyUnicode_GET_LENGTH(string);
1301 targetlen = len;
1302 if (copy_null)
1303 targetlen++;
1304 if (!target) {
1305 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1306 PyErr_NoMemory();
1307 return NULL;
1308 }
1309 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1310 if (!target) {
1311 PyErr_NoMemory();
1312 return NULL;
1313 }
1314 }
1315 else {
1316 if (targetsize < targetlen) {
1317 PyErr_Format(PyExc_SystemError,
1318 "string is longer than the buffer");
1319 if (copy_null && 0 < targetsize)
1320 target[0] = 0;
1321 return NULL;
1322 }
1323 }
1324 if (kind != PyUnicode_4BYTE_KIND) {
1325 Py_ssize_t i;
1326 for (i = 0; i < len; i++)
1327 target[i] = PyUnicode_READ(kind, data, i);
1328 }
1329 else
1330 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1331 if (copy_null)
1332 target[len] = 0;
1333 return target;
1334}
1335
1336Py_UCS4*
1337PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1338 int copy_null)
1339{
1340 if (target == NULL || targetsize < 1) {
1341 PyErr_BadInternalCall();
1342 return NULL;
1343 }
1344 return as_ucs4(string, target, targetsize, copy_null);
1345}
1346
1347Py_UCS4*
1348PyUnicode_AsUCS4Copy(PyObject *string)
1349{
1350 return as_ucs4(string, NULL, 0, 1);
1351}
1352
1353#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001354
Alexander Belopolsky40018472011-02-26 01:02:56 +00001355PyObject *
1356PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001359 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001360 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001361 PyErr_BadInternalCall();
1362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363 }
1364
Martin v. Löwis790465f2008-04-05 20:41:37 +00001365 if (size == -1) {
1366 size = wcslen(w);
1367 }
1368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370}
1371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001373
Walter Dörwald346737f2007-05-31 10:44:43 +00001374static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001375makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1376 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001377{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001378 *fmt++ = '%';
1379 if (width) {
1380 if (zeropad)
1381 *fmt++ = '0';
1382 fmt += sprintf(fmt, "%d", width);
1383 }
1384 if (precision)
1385 fmt += sprintf(fmt, ".%d", precision);
1386 if (longflag)
1387 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001388 else if (longlongflag) {
1389 /* longlongflag should only ever be nonzero on machines with
1390 HAVE_LONG_LONG defined */
1391#ifdef HAVE_LONG_LONG
1392 char *f = PY_FORMAT_LONG_LONG;
1393 while (*f)
1394 *fmt++ = *f++;
1395#else
1396 /* we shouldn't ever get here */
1397 assert(0);
1398 *fmt++ = 'l';
1399#endif
1400 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001401 else if (size_tflag) {
1402 char *f = PY_FORMAT_SIZE_T;
1403 while (*f)
1404 *fmt++ = *f++;
1405 }
1406 *fmt++ = c;
1407 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001408}
1409
/* helper for PyUnicode_FromFormatV()

   Parse the part of a format specification that follows '%': an optional
   decimal width, an optional ".precision", and an optional length
   modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z") that is
   only recognised when directly followed by 'd', 'u' or 'i'.

   `f` must point at the '%'.  Each p_* output pointer may be NULL, in
   which case that value is not reported.

   Return a pointer to the first character that was not consumed
   (normally the conversion character).  For the malformed inputs handled
   below the pointer is moved one character back instead. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; *f >= '0' && *f <= '9'; f++)
        width = (width * 10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; *f >= '0' && *f <= '9'; f++)
            precision = (precision * 10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* Report only the values the caller asked for. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1474
/* Upper bound on the number of characters produced by %ld: enough for a
   64-bit integer written in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1482
Walter Dörwaldd2034312007-05-18 16:29:38 +00001483PyObject *
1484PyUnicode_FromFormatV(const char *format, va_list vargs)
1485{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001486 va_list count;
1487 Py_ssize_t callcount = 0;
1488 PyObject **callresults = NULL;
1489 PyObject **callresult = NULL;
1490 Py_ssize_t n = 0;
1491 int width = 0;
1492 int precision = 0;
1493 int zeropad;
1494 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001496 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001497 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001498 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1499 Py_UCS4 argmaxchar;
1500 Py_ssize_t numbersize = 0;
1501 char *numberresults = NULL;
1502 char *numberresult = NULL;
1503 Py_ssize_t i;
1504 int kind;
1505 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001506
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001507 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001508 /* step 1: count the number of %S/%R/%A/%s format specifications
1509 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1510 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 * result in an array)
1512 * also esimate a upper bound for all the number formats in the string,
1513 * numbers will be formated in step 3 and be keept in a '\0'-separated
1514 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001515 for (f = format; *f; f++) {
1516 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001517 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1519 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1520 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1521 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001524#ifdef HAVE_LONG_LONG
1525 if (longlongflag) {
1526 if (width < MAX_LONG_LONG_CHARS)
1527 width = MAX_LONG_LONG_CHARS;
1528 }
1529 else
1530#endif
1531 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1532 including sign. Decimal takes the most space. This
1533 isn't enough for octal. If a width is specified we
1534 need more (which we allocate later). */
1535 if (width < MAX_LONG_CHARS)
1536 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537
1538 /* account for the size + '\0' to separate numbers
1539 inside of the numberresults buffer */
1540 numbersize += (width + 1);
1541 }
1542 }
1543 else if ((unsigned char)*f > 127) {
1544 PyErr_Format(PyExc_ValueError,
1545 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1546 "string, got a non-ASCII byte: 0x%02x",
1547 (unsigned char)*f);
1548 return NULL;
1549 }
1550 }
1551 /* step 2: allocate memory for the results of
1552 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1553 if (callcount) {
1554 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1555 if (!callresults) {
1556 PyErr_NoMemory();
1557 return NULL;
1558 }
1559 callresult = callresults;
1560 }
1561 /* step 2.5: allocate memory for the results of formating numbers */
1562 if (numbersize) {
1563 numberresults = PyObject_Malloc(numbersize);
1564 if (!numberresults) {
1565 PyErr_NoMemory();
1566 goto fail;
1567 }
1568 numberresult = numberresults;
1569 }
1570
1571 /* step 3: format numbers and figure out how large a buffer we need */
1572 for (f = format; *f; f++) {
1573 if (*f == '%') {
1574 const char* p;
1575 int longflag;
1576 int longlongflag;
1577 int size_tflag;
1578 int numprinted;
1579
1580 p = f;
1581 zeropad = (f[1] == '0');
1582 f = parse_format_flags(f, &width, &precision,
1583 &longflag, &longlongflag, &size_tflag);
1584 switch (*f) {
1585 case 'c':
1586 {
1587 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001588 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 n++;
1590 break;
1591 }
1592 case '%':
1593 n++;
1594 break;
1595 case 'i':
1596 case 'd':
1597 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1598 width, precision, *f);
1599 if (longflag)
1600 numprinted = sprintf(numberresult, fmt,
1601 va_arg(count, long));
1602#ifdef HAVE_LONG_LONG
1603 else if (longlongflag)
1604 numprinted = sprintf(numberresult, fmt,
1605 va_arg(count, PY_LONG_LONG));
1606#endif
1607 else if (size_tflag)
1608 numprinted = sprintf(numberresult, fmt,
1609 va_arg(count, Py_ssize_t));
1610 else
1611 numprinted = sprintf(numberresult, fmt,
1612 va_arg(count, int));
1613 n += numprinted;
1614 /* advance by +1 to skip over the '\0' */
1615 numberresult += (numprinted + 1);
1616 assert(*(numberresult - 1) == '\0');
1617 assert(*(numberresult - 2) != '\0');
1618 assert(numprinted >= 0);
1619 assert(numberresult <= numberresults + numbersize);
1620 break;
1621 case 'u':
1622 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1623 width, precision, 'u');
1624 if (longflag)
1625 numprinted = sprintf(numberresult, fmt,
1626 va_arg(count, unsigned long));
1627#ifdef HAVE_LONG_LONG
1628 else if (longlongflag)
1629 numprinted = sprintf(numberresult, fmt,
1630 va_arg(count, unsigned PY_LONG_LONG));
1631#endif
1632 else if (size_tflag)
1633 numprinted = sprintf(numberresult, fmt,
1634 va_arg(count, size_t));
1635 else
1636 numprinted = sprintf(numberresult, fmt,
1637 va_arg(count, unsigned int));
1638 n += numprinted;
1639 numberresult += (numprinted + 1);
1640 assert(*(numberresult - 1) == '\0');
1641 assert(*(numberresult - 2) != '\0');
1642 assert(numprinted >= 0);
1643 assert(numberresult <= numberresults + numbersize);
1644 break;
1645 case 'x':
1646 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1647 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1648 n += numprinted;
1649 numberresult += (numprinted + 1);
1650 assert(*(numberresult - 1) == '\0');
1651 assert(*(numberresult - 2) != '\0');
1652 assert(numprinted >= 0);
1653 assert(numberresult <= numberresults + numbersize);
1654 break;
1655 case 'p':
1656 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1657 /* %p is ill-defined: ensure leading 0x. */
1658 if (numberresult[1] == 'X')
1659 numberresult[1] = 'x';
1660 else if (numberresult[1] != 'x') {
1661 memmove(numberresult + 2, numberresult,
1662 strlen(numberresult) + 1);
1663 numberresult[0] = '0';
1664 numberresult[1] = 'x';
1665 numprinted += 2;
1666 }
1667 n += numprinted;
1668 numberresult += (numprinted + 1);
1669 assert(*(numberresult - 1) == '\0');
1670 assert(*(numberresult - 2) != '\0');
1671 assert(numprinted >= 0);
1672 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001673 break;
1674 case 's':
1675 {
1676 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001677 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001678 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1679 if (!str)
1680 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 /* since PyUnicode_DecodeUTF8 returns already flexible
1682 unicode objects, there is no need to call ready on them */
1683 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001684 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001686 /* Remember the str and switch to the next slot */
1687 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001688 break;
1689 }
1690 case 'U':
1691 {
1692 PyObject *obj = va_arg(count, PyObject *);
1693 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 if (PyUnicode_READY(obj) == -1)
1695 goto fail;
1696 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001697 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001699 break;
1700 }
1701 case 'V':
1702 {
1703 PyObject *obj = va_arg(count, PyObject *);
1704 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001705 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001706 assert(obj || str);
1707 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001708 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 if (PyUnicode_READY(obj) == -1)
1710 goto fail;
1711 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001712 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001714 *callresult++ = NULL;
1715 }
1716 else {
1717 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1718 if (!str_obj)
1719 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001721 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001723 *callresult++ = str_obj;
1724 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001725 break;
1726 }
1727 case 'S':
1728 {
1729 PyObject *obj = va_arg(count, PyObject *);
1730 PyObject *str;
1731 assert(obj);
1732 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001734 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001736 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001738 /* Remember the str and switch to the next slot */
1739 *callresult++ = str;
1740 break;
1741 }
1742 case 'R':
1743 {
1744 PyObject *obj = va_arg(count, PyObject *);
1745 PyObject *repr;
1746 assert(obj);
1747 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001749 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001751 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001753 /* Remember the repr and switch to the next slot */
1754 *callresult++ = repr;
1755 break;
1756 }
1757 case 'A':
1758 {
1759 PyObject *obj = va_arg(count, PyObject *);
1760 PyObject *ascii;
1761 assert(obj);
1762 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001764 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001766 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001768 /* Remember the repr and switch to the next slot */
1769 *callresult++ = ascii;
1770 break;
1771 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001772 default:
1773 /* if we stumble upon an unknown
1774 formatting code, copy the rest of
1775 the format string to the output
1776 string. (we cannot just skip the
1777 code, since there's no way to know
1778 what's in the argument list) */
1779 n += strlen(p);
1780 goto expand;
1781 }
1782 } else
1783 n++;
1784 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001786 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001788 we don't have to resize the string.
1789 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001791 if (!string)
1792 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 kind = PyUnicode_KIND(string);
1794 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001795 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001799 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001800 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001801
1802 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1804 /* checking for == because the last argument could be a empty
1805 string, which causes i to point to end, the assert at the end of
1806 the loop */
1807 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001808
Benjamin Peterson14339b62009-01-31 16:36:08 +00001809 switch (*f) {
1810 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001811 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 const int ordinal = va_arg(vargs, int);
1813 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001814 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001815 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001816 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001817 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001818 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001819 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 case 'p':
1821 /* unused, since we already have the result */
1822 if (*f == 'p')
1823 (void) va_arg(vargs, void *);
1824 else
1825 (void) va_arg(vargs, int);
1826 /* extract the result from numberresults and append. */
1827 for (; *numberresult; ++i, ++numberresult)
1828 PyUnicode_WRITE(kind, data, i, *numberresult);
1829 /* skip over the separating '\0' */
1830 assert(*numberresult == '\0');
1831 numberresult++;
1832 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001833 break;
1834 case 's':
1835 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001836 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001838 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 size = PyUnicode_GET_LENGTH(*callresult);
1840 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001841 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1842 *callresult, 0,
1843 size) < 0)
1844 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001846 /* We're done with the unicode()/repr() => forget it */
1847 Py_DECREF(*callresult);
1848 /* switch to next unicode()/repr() result */
1849 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001850 break;
1851 }
1852 case 'U':
1853 {
1854 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 Py_ssize_t size;
1856 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1857 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001858 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1859 obj, 0,
1860 size) < 0)
1861 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001863 break;
1864 }
1865 case 'V':
1866 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001868 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001869 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001870 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871 size = PyUnicode_GET_LENGTH(obj);
1872 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001873 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1874 obj, 0,
1875 size) < 0)
1876 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001877 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001878 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879 size = PyUnicode_GET_LENGTH(*callresult);
1880 assert(PyUnicode_KIND(*callresult) <=
1881 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001882 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1883 *callresult,
1884 0, size) < 0)
1885 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001887 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001888 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001889 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001890 break;
1891 }
1892 case 'S':
1893 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001894 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001895 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001896 /* unused, since we already have the result */
1897 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001899 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1900 *callresult, 0,
1901 PyUnicode_GET_LENGTH(*callresult)) < 0)
1902 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001904 /* We're done with the unicode()/repr() => forget it */
1905 Py_DECREF(*callresult);
1906 /* switch to next unicode()/repr() result */
1907 ++callresult;
1908 break;
1909 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001910 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001912 break;
1913 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 for (; *p; ++p, ++i)
1915 PyUnicode_WRITE(kind, data, i, *p);
1916 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001917 goto end;
1918 }
Victor Stinner1205f272010-09-11 00:54:47 +00001919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920 else {
1921 assert(i < PyUnicode_GET_LENGTH(string));
1922 PyUnicode_WRITE(kind, data, i++, *f);
1923 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001925 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001926
Benjamin Peterson29060642009-01-31 22:14:21 +00001927 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001928 if (callresults)
1929 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 if (numberresults)
1931 PyObject_Free(numberresults);
1932 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001933 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001934 if (callresults) {
1935 PyObject **callresult2 = callresults;
1936 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001937 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001938 ++callresult2;
1939 }
1940 PyObject_Free(callresults);
1941 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001942 if (numberresults)
1943 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001944 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001945}
1946
Walter Dörwaldd2034312007-05-18 16:29:38 +00001947PyObject *
1948PyUnicode_FromFormat(const char *format, ...)
1949{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001950 PyObject* ret;
1951 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001952
1953#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001954 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001955#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001956 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001957#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001958 ret = PyUnicode_FromFormatV(format, vargs);
1959 va_end(vargs);
1960 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001961}
1962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963#ifdef HAVE_WCHAR_H
1964
Victor Stinner5593d8a2010-10-02 11:11:27 +00001965/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1966 convert a Unicode object to a wide character string.
1967
Victor Stinnerd88d9832011-09-06 02:00:05 +02001968 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001969 character) required to convert the unicode object. Ignore size argument.
1970
Victor Stinnerd88d9832011-09-06 02:00:05 +02001971 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001972 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001973 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001974static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001975unicode_aswidechar(PyUnicodeObject *unicode,
1976 wchar_t *w,
1977 Py_ssize_t size)
1978{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001979 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980 const wchar_t *wstr;
1981
1982 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1983 if (wstr == NULL)
1984 return -1;
1985
Victor Stinner5593d8a2010-10-02 11:11:27 +00001986 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001987 if (size > res)
1988 size = res + 1;
1989 else
1990 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001992 return res;
1993 }
1994 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001996}
1997
1998Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001999PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002000 wchar_t *w,
2001 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002002{
2003 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002004 PyErr_BadInternalCall();
2005 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002007 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008}
2009
Victor Stinner137c34c2010-09-29 10:25:54 +00002010wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002011PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002012 Py_ssize_t *size)
2013{
2014 wchar_t* buffer;
2015 Py_ssize_t buflen;
2016
2017 if (unicode == NULL) {
2018 PyErr_BadInternalCall();
2019 return NULL;
2020 }
2021
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002022 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 if (buflen == -1)
2024 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002025 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002026 PyErr_NoMemory();
2027 return NULL;
2028 }
2029
Victor Stinner137c34c2010-09-29 10:25:54 +00002030 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2031 if (buffer == NULL) {
2032 PyErr_NoMemory();
2033 return NULL;
2034 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002035 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 if (buflen == -1)
2037 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002038 if (size != NULL)
2039 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002040 return buffer;
2041}
2042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044
Alexander Belopolsky40018472011-02-26 01:02:56 +00002045PyObject *
2046PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002047{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002049 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002050 PyErr_SetString(PyExc_ValueError,
2051 "chr() arg not in range(0x110000)");
2052 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002053 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 if (ordinal < 256)
2056 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058 v = PyUnicode_New(1, ordinal);
2059 if (v == NULL)
2060 return NULL;
2061 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2062 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002063}
2064
Alexander Belopolsky40018472011-02-26 01:02:56 +00002065PyObject *
2066PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002068 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002069 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002070 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002071 if (PyUnicode_READY(obj))
2072 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002073 Py_INCREF(obj);
2074 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002075 }
2076 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002077 /* For a Unicode subtype that's not a Unicode object,
2078 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002079 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002080 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002081 PyErr_Format(PyExc_TypeError,
2082 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002083 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002084 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002085}
2086
Alexander Belopolsky40018472011-02-26 01:02:56 +00002087PyObject *
2088PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002089 const char *encoding,
2090 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002091{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002092 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002093 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002094
Guido van Rossumd57fd912000-03-10 22:53:23 +00002095 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002096 PyErr_BadInternalCall();
2097 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002099
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002100 /* Decoding bytes objects is the most common case and should be fast */
2101 if (PyBytes_Check(obj)) {
2102 if (PyBytes_GET_SIZE(obj) == 0) {
2103 Py_INCREF(unicode_empty);
2104 v = (PyObject *) unicode_empty;
2105 }
2106 else {
2107 v = PyUnicode_Decode(
2108 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2109 encoding, errors);
2110 }
2111 return v;
2112 }
2113
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002114 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002115 PyErr_SetString(PyExc_TypeError,
2116 "decoding str is not supported");
2117 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002118 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002119
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002120 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2121 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2122 PyErr_Format(PyExc_TypeError,
2123 "coercing to str: need bytes, bytearray "
2124 "or buffer-like object, %.80s found",
2125 Py_TYPE(obj)->tp_name);
2126 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002127 }
Tim Petersced69f82003-09-16 20:30:58 +00002128
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002129 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002130 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002131 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 }
Tim Petersced69f82003-09-16 20:30:58 +00002133 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002134 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002135
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002136 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002137 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138}
2139
/* Copy `encoding` into `lower`, folding upper case letters to lower case
   and turning '_' into '-', so that spellings such as UTF_8 are caught.
   `lower_len` is the capacity of `lower` including the terminating NUL.
   Return 1 on success, 0 if the encoding name is longer than
   lower_len-1 characters (in which case `lower` is left unterminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last slot of the output buffer, reserved for the NUL */
    char *const limit = &lower[lower_len - 1];

    for (; *src != '\0'; src++, dst++) {
        if (dst == limit)
            return 0;

        if (*src == '_')
            *dst = '-';
        else if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2172
Alexander Belopolsky40018472011-02-26 01:02:56 +00002173PyObject *
2174PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002175 Py_ssize_t size,
2176 const char *encoding,
2177 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002178{
2179 PyObject *buffer = NULL, *unicode;
2180 Py_buffer info;
2181 char lower[11]; /* Enough for any encoding shortcut */
2182
2183 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002184 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002185
2186 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002187 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002188 if ((strcmp(lower, "utf-8") == 0) ||
2189 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002190 return PyUnicode_DecodeUTF8(s, size, errors);
2191 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002192 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002193 (strcmp(lower, "iso-8859-1") == 0))
2194 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002195#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002196 else if (strcmp(lower, "mbcs") == 0)
2197 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002198#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002199 else if (strcmp(lower, "ascii") == 0)
2200 return PyUnicode_DecodeASCII(s, size, errors);
2201 else if (strcmp(lower, "utf-16") == 0)
2202 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2203 else if (strcmp(lower, "utf-32") == 0)
2204 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206
2207 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002208 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002209 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002210 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002211 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212 if (buffer == NULL)
2213 goto onError;
2214 unicode = PyCodec_Decode(buffer, encoding, errors);
2215 if (unicode == NULL)
2216 goto onError;
2217 if (!PyUnicode_Check(unicode)) {
2218 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002219 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002220 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 Py_DECREF(unicode);
2222 goto onError;
2223 }
2224 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 if (PyUnicode_READY(unicode)) {
2226 Py_DECREF(unicode);
2227 return NULL;
2228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002230
Benjamin Peterson29060642009-01-31 22:14:21 +00002231 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 Py_XDECREF(buffer);
2233 return NULL;
2234}
2235
Alexander Belopolsky40018472011-02-26 01:02:56 +00002236PyObject *
2237PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002238 const char *encoding,
2239 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002240{
2241 PyObject *v;
2242
2243 if (!PyUnicode_Check(unicode)) {
2244 PyErr_BadArgument();
2245 goto onError;
2246 }
2247
2248 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002249 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002250
2251 /* Decode via the codec registry */
2252 v = PyCodec_Decode(unicode, encoding, errors);
2253 if (v == NULL)
2254 goto onError;
2255 return v;
2256
Benjamin Peterson29060642009-01-31 22:14:21 +00002257 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002258 return NULL;
2259}
2260
Alexander Belopolsky40018472011-02-26 01:02:56 +00002261PyObject *
2262PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002263 const char *encoding,
2264 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002265{
2266 PyObject *v;
2267
2268 if (!PyUnicode_Check(unicode)) {
2269 PyErr_BadArgument();
2270 goto onError;
2271 }
2272
2273 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002274 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002275
2276 /* Decode via the codec registry */
2277 v = PyCodec_Decode(unicode, encoding, errors);
2278 if (v == NULL)
2279 goto onError;
2280 if (!PyUnicode_Check(v)) {
2281 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002282 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002283 Py_TYPE(v)->tp_name);
2284 Py_DECREF(v);
2285 goto onError;
2286 }
2287 return v;
2288
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002290 return NULL;
2291}
2292
Alexander Belopolsky40018472011-02-26 01:02:56 +00002293PyObject *
2294PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002295 Py_ssize_t size,
2296 const char *encoding,
2297 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298{
2299 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002300
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301 unicode = PyUnicode_FromUnicode(s, size);
2302 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2305 Py_DECREF(unicode);
2306 return v;
2307}
2308
Alexander Belopolsky40018472011-02-26 01:02:56 +00002309PyObject *
2310PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002311 const char *encoding,
2312 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002313{
2314 PyObject *v;
2315
2316 if (!PyUnicode_Check(unicode)) {
2317 PyErr_BadArgument();
2318 goto onError;
2319 }
2320
2321 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002322 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002323
2324 /* Encode via the codec registry */
2325 v = PyCodec_Encode(unicode, encoding, errors);
2326 if (v == NULL)
2327 goto onError;
2328 return v;
2329
Benjamin Peterson29060642009-01-31 22:14:21 +00002330 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002331 return NULL;
2332}
2333
Victor Stinnerad158722010-10-27 00:25:46 +00002334PyObject *
2335PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002336{
Victor Stinner99b95382011-07-04 14:23:54 +02002337#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002338 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2339 PyUnicode_GET_SIZE(unicode),
2340 NULL);
2341#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002343#else
Victor Stinner793b5312011-04-27 00:24:21 +02002344 PyInterpreterState *interp = PyThreadState_GET()->interp;
2345 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2346 cannot use it to encode and decode filenames before it is loaded. Load
2347 the Python codec requires to encode at least its own filename. Use the C
2348 version of the locale codec until the codec registry is initialized and
2349 the Python codec is loaded.
2350
2351 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2352 cannot only rely on it: check also interp->fscodec_initialized for
2353 subinterpreters. */
2354 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002355 return PyUnicode_AsEncodedString(unicode,
2356 Py_FileSystemDefaultEncoding,
2357 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002358 }
2359 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002360 /* locale encoding with surrogateescape */
2361 wchar_t *wchar;
2362 char *bytes;
2363 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002364 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002365
2366 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2367 if (wchar == NULL)
2368 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002369 bytes = _Py_wchar2char(wchar, &error_pos);
2370 if (bytes == NULL) {
2371 if (error_pos != (size_t)-1) {
2372 char *errmsg = strerror(errno);
2373 PyObject *exc = NULL;
2374 if (errmsg == NULL)
2375 errmsg = "Py_wchar2char() failed";
2376 raise_encode_exception(&exc,
2377 "filesystemencoding",
2378 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2379 error_pos, error_pos+1,
2380 errmsg);
2381 Py_XDECREF(exc);
2382 }
2383 else
2384 PyErr_NoMemory();
2385 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002386 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002387 }
2388 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002389
2390 bytes_obj = PyBytes_FromString(bytes);
2391 PyMem_Free(bytes);
2392 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002393 }
Victor Stinnerad158722010-10-27 00:25:46 +00002394#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002395}
2396
Alexander Belopolsky40018472011-02-26 01:02:56 +00002397PyObject *
2398PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002399 const char *encoding,
2400 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002401{
2402 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002403 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002404
Guido van Rossumd57fd912000-03-10 22:53:23 +00002405 if (!PyUnicode_Check(unicode)) {
2406 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002408 }
Fred Drakee4315f52000-05-09 19:53:39 +00002409
Victor Stinner2f283c22011-03-02 01:21:46 +00002410 if (encoding == NULL) {
2411 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002413 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002415 }
Fred Drakee4315f52000-05-09 19:53:39 +00002416
2417 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002418 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002419 if ((strcmp(lower, "utf-8") == 0) ||
2420 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002421 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002422 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002424 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002425 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002426 }
Victor Stinner37296e82010-06-10 13:36:23 +00002427 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002428 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002429 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002430 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002431#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002432 else if (strcmp(lower, "mbcs") == 0)
2433 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2434 PyUnicode_GET_SIZE(unicode),
2435 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002436#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002437 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002440
2441 /* Encode via the codec registry */
2442 v = PyCodec_Encode(unicode, encoding, errors);
2443 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002444 return NULL;
2445
2446 /* The normal path */
2447 if (PyBytes_Check(v))
2448 return v;
2449
2450 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002451 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002452 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002453 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002454
2455 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2456 "encoder %s returned bytearray instead of bytes",
2457 encoding);
2458 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002459 Py_DECREF(v);
2460 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002461 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002462
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002463 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2464 Py_DECREF(v);
2465 return b;
2466 }
2467
2468 PyErr_Format(PyExc_TypeError,
2469 "encoder did not return a bytes object (type=%.400s)",
2470 Py_TYPE(v)->tp_name);
2471 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002472 return NULL;
2473}
2474
Alexander Belopolsky40018472011-02-26 01:02:56 +00002475PyObject *
2476PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002477 const char *encoding,
2478 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002479{
2480 PyObject *v;
2481
2482 if (!PyUnicode_Check(unicode)) {
2483 PyErr_BadArgument();
2484 goto onError;
2485 }
2486
2487 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002488 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002489
2490 /* Encode via the codec registry */
2491 v = PyCodec_Encode(unicode, encoding, errors);
2492 if (v == NULL)
2493 goto onError;
2494 if (!PyUnicode_Check(v)) {
2495 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002496 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002497 Py_TYPE(v)->tp_name);
2498 Py_DECREF(v);
2499 goto onError;
2500 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002502
Benjamin Peterson29060642009-01-31 22:14:21 +00002503 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 return NULL;
2505}
2506
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002507PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002508PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002509 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002510 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2511}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002512
Christian Heimes5894ba72007-11-04 11:43:14 +00002513PyObject*
2514PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2515{
Victor Stinner99b95382011-07-04 14:23:54 +02002516#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002517 return PyUnicode_DecodeMBCS(s, size, NULL);
2518#elif defined(__APPLE__)
2519 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2520#else
Victor Stinner793b5312011-04-27 00:24:21 +02002521 PyInterpreterState *interp = PyThreadState_GET()->interp;
2522 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2523 cannot use it to encode and decode filenames before it is loaded. Load
2524 the Python codec requires to encode at least its own filename. Use the C
2525 version of the locale codec until the codec registry is initialized and
2526 the Python codec is loaded.
2527
2528 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2529 cannot only rely on it: check also interp->fscodec_initialized for
2530 subinterpreters. */
2531 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002532 return PyUnicode_Decode(s, size,
2533 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002534 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002535 }
2536 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002537 /* locale encoding with surrogateescape */
2538 wchar_t *wchar;
2539 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002540 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002541
2542 if (s[size] != '\0' || size != strlen(s)) {
2543 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2544 return NULL;
2545 }
2546
Victor Stinner168e1172010-10-16 23:16:16 +00002547 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002548 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002549 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002550
Victor Stinner168e1172010-10-16 23:16:16 +00002551 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002552 PyMem_Free(wchar);
2553 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002554 }
Victor Stinnerad158722010-10-27 00:25:46 +00002555#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002556}
2557
Martin v. Löwis011e8422009-05-05 04:43:17 +00002558
2559int
2560PyUnicode_FSConverter(PyObject* arg, void* addr)
2561{
2562 PyObject *output = NULL;
2563 Py_ssize_t size;
2564 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002565 if (arg == NULL) {
2566 Py_DECREF(*(PyObject**)addr);
2567 return 1;
2568 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002569 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002570 output = arg;
2571 Py_INCREF(output);
2572 }
2573 else {
2574 arg = PyUnicode_FromObject(arg);
2575 if (!arg)
2576 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002577 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002578 Py_DECREF(arg);
2579 if (!output)
2580 return 0;
2581 if (!PyBytes_Check(output)) {
2582 Py_DECREF(output);
2583 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2584 return 0;
2585 }
2586 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002587 size = PyBytes_GET_SIZE(output);
2588 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002589 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002590 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002591 Py_DECREF(output);
2592 return 0;
2593 }
2594 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002595 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002596}
2597
2598
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002599int
2600PyUnicode_FSDecoder(PyObject* arg, void* addr)
2601{
2602 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002603 if (arg == NULL) {
2604 Py_DECREF(*(PyObject**)addr);
2605 return 1;
2606 }
2607 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 if (PyUnicode_READY(arg))
2609 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002610 output = arg;
2611 Py_INCREF(output);
2612 }
2613 else {
2614 arg = PyBytes_FromObject(arg);
2615 if (!arg)
2616 return 0;
2617 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2618 PyBytes_GET_SIZE(arg));
2619 Py_DECREF(arg);
2620 if (!output)
2621 return 0;
2622 if (!PyUnicode_Check(output)) {
2623 Py_DECREF(output);
2624 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2625 return 0;
2626 }
2627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2629 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002630 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2631 Py_DECREF(output);
2632 return 0;
2633 }
2634 *(PyObject**)addr = output;
2635 return Py_CLEANUP_SUPPORTED;
2636}
2637
2638
Martin v. Löwis5b222132007-06-10 09:51:05 +00002639char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002641{
Christian Heimesf3863112007-11-22 07:46:41 +00002642 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002643 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2644
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002645 if (!PyUnicode_Check(unicode)) {
2646 PyErr_BadArgument();
2647 return NULL;
2648 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002649 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002650 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651
2652 if (_PyUnicode_UTF8(unicode) == NULL) {
2653 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2654 if (bytes == NULL)
2655 return NULL;
2656 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2657 if (u->_base.utf8 == NULL) {
2658 Py_DECREF(bytes);
2659 return NULL;
2660 }
2661 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2662 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2663 Py_DECREF(bytes);
2664 }
2665
2666 if (psize)
2667 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2668 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002669}
2670
2671char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002673{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2675}
2676
#ifdef Py_DEBUG
/* Debug builds only: incremented each time PyUnicode_AsUnicodeAndSize()
   finds no wstr buffer on the object and has to build one. */
int unicode_as_unicode_calls = 0;
#endif


/* Return the wchar_t (Py_UNICODE) representation of a str object and, if
   size is not NULL, store its length (in wchar_t units) there.

   If the object has no wstr buffer yet, one is allocated, filled from the
   object's 1-, 2- or 4-byte data, NUL-terminated and stored on the object;
   later calls return the stored buffer.

   Returns NULL with an exception set if the argument is not a str or if
   the buffer cannot be allocated. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    PyUnicodeObject *u;
    const unsigned char *one_byte;
    /* Only the source pointers needed for this wchar_t width are declared:
       2-byte data is widened when wchar_t is 4 bytes, 4-byte data is
       converted when it is not. */
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    u = (PyUnicodeObject*)unicode;
    if (_PyUnicode_WSTR(u) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(u) != 0);
        assert(PyUnicode_IS_READY(u));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the code points above 0xFFFF; each of
               them needs two wchar_t units (a surrogate pair). */
            four_bytes = PyUnicode_4BYTE_DATA(u);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* one unit per code point, one extra per surrogate pair,
               plus one for the terminating NUL */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;

            /* Second pass: copy, splitting code points above 0xFFFF. */
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
            four_bytes = PyUnicode_4BYTE_DATA(u);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    /* encode surrogate pair in this case */
                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
                }
                else
                    *w = *four_bytes;

                /* sanity check against the length computed above */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte data: one wchar_t per code point, plus
               one for the terminating NUL */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(u) + 1));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* the wstr length is not written for compact ASCII objects */
            if (!PyUnicode_IS_COMPACT_ASCII(u))
                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_LENGTH(u);

            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
                one_byte = PyUnicode_1BYTE_DATA(u);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                two_bytes = PyUnicode_2BYTE_DATA(u);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* release the buffer allocated above before aborting */
                PyObject_FREE(_PyUnicode_WSTR(u));
                _PyUnicode_WSTR(u) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(u);
    return _PyUnicode_WSTR(u);
}
2798
Alexander Belopolsky40018472011-02-26 01:02:56 +00002799Py_UNICODE *
2800PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803}
2804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002805
Alexander Belopolsky40018472011-02-26 01:02:56 +00002806Py_ssize_t
2807PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808{
2809 if (!PyUnicode_Check(unicode)) {
2810 PyErr_BadArgument();
2811 goto onError;
2812 }
2813 return PyUnicode_GET_SIZE(unicode);
2814
Benjamin Peterson29060642009-01-31 22:14:21 +00002815 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 return -1;
2817}
2818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819Py_ssize_t
2820PyUnicode_GetLength(PyObject *unicode)
2821{
2822 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2823 PyErr_BadArgument();
2824 return -1;
2825 }
2826
2827 return PyUnicode_GET_LENGTH(unicode);
2828}
2829
2830Py_UCS4
2831PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2832{
2833 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2834 return PyErr_BadArgument();
2835 return (Py_UCS4)-1;
2836 }
2837 return PyUnicode_READ_CHAR(unicode, index);
2838}
2839
2840int
2841PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2842{
2843 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2844 return PyErr_BadArgument();
2845 return -1;
2846 }
2847
2848 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2849 index, ch);
2850 return 0;
2851}
2852
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2858
Victor Stinner554f3f02010-06-16 23:33:54 +00002859/* create or adjust a UnicodeDecodeError */
2860static void
2861make_decode_exception(PyObject **exceptionObject,
2862 const char *encoding,
2863 const char *input, Py_ssize_t length,
2864 Py_ssize_t startpos, Py_ssize_t endpos,
2865 const char *reason)
2866{
2867 if (*exceptionObject == NULL) {
2868 *exceptionObject = PyUnicodeDecodeError_Create(
2869 encoding, input, length, startpos, endpos, reason);
2870 }
2871 else {
2872 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2873 goto onError;
2874 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2875 goto onError;
2876 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2877 goto onError;
2878 }
2879 return;
2880
2881onError:
2882 Py_DECREF(*exceptionObject);
2883 *exceptionObject = NULL;
2884}
2885
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002886/* error handling callback helper:
2887 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002888 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002889 and adjust various state variables.
2890 return 0 on success, -1 on error
2891*/
2892
Alexander Belopolsky40018472011-02-26 01:02:56 +00002893static int
2894unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002895 const char *encoding, const char *reason,
2896 const char **input, const char **inend, Py_ssize_t *startinpos,
2897 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2898 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002899{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002900 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002901
2902 PyObject *restuple = NULL;
2903 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002904 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002905 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002906 Py_ssize_t requiredsize;
2907 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002908 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002909 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002910 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002911 int res = -1;
2912
2913 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002914 *errorHandler = PyCodec_LookupError(errors);
2915 if (*errorHandler == NULL)
2916 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002917 }
2918
Victor Stinner554f3f02010-06-16 23:33:54 +00002919 make_decode_exception(exceptionObject,
2920 encoding,
2921 *input, *inend - *input,
2922 *startinpos, *endinpos,
2923 reason);
2924 if (*exceptionObject == NULL)
2925 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002926
2927 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2928 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002929 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002930 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002931 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002932 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002933 }
2934 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002935 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002936
2937 /* Copy back the bytes variables, which might have been modified by the
2938 callback */
2939 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2940 if (!inputobj)
2941 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002942 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002943 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002944 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002945 *input = PyBytes_AS_STRING(inputobj);
2946 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002947 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002948 /* we can DECREF safely, as the exception has another reference,
2949 so the object won't go away. */
2950 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002951
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002952 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002953 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002954 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002955 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2956 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002957 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002958
2959 /* need more space? (at least enough for what we
2960 have+the replacement+the rest of the string (starting
2961 at the new input position), so we won't have to check space
2962 when there are no errors in the rest of the string) */
2963 repptr = PyUnicode_AS_UNICODE(repunicode);
2964 repsize = PyUnicode_GET_SIZE(repunicode);
2965 requiredsize = *outpos + repsize + insize-newpos;
2966 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002967 if (requiredsize<2*outsize)
2968 requiredsize = 2*outsize;
2969 if (_PyUnicode_Resize(output, requiredsize) < 0)
2970 goto onError;
2971 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972 }
2973 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002974 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002975 Py_UNICODE_COPY(*outptr, repptr, repsize);
2976 *outptr += repsize;
2977 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002978
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979 /* we made it! */
2980 res = 0;
2981
Benjamin Peterson29060642009-01-31 22:14:21 +00002982 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002983 Py_XDECREF(restuple);
2984 return res;
2985}
2986
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002987/* --- UTF-7 Codec -------------------------------------------------------- */
2988
Antoine Pitrou244651a2009-05-04 18:56:13 +00002989/* See RFC2152 for details. We encode conservatively and decode liberally. */
2990
2991/* Three simple macros defining base-64. */
2992
2993/* Is c a base-64 character? */
2994
/* Non-zero if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  Evaluates c more than once. */
#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, and anything
   else, i.e. '/', -> 63).  Evaluates c more than once. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0-127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * Only code points 1-127 can qualify: the range test comes first, so
 * utf7_category is never indexed outside its 128 entries.  Set D
 * characters always qualify; whitespace and Set O qualify only when
 * directWS / directO are non-zero.  Evaluates c more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Alexander Belopolsky40018472011-02-26 01:02:56 +00003068PyObject *
3069PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003070 Py_ssize_t size,
3071 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003072{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003073 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3074}
3075
Antoine Pitrou244651a2009-05-04 18:56:13 +00003076/* The decoder. The only state we preserve is our read position,
3077 * i.e. how many characters we have consumed. So if we end in the
3078 * middle of a shift sequence we have to back off the read position
3079 * and the output to the beginning of the sequence, otherwise we lose
3080 * all the shift state (seen bits, number of bits seen, high
3081 * surrogate). */
3082
Alexander Belopolsky40018472011-02-26 01:02:56 +00003083PyObject *
3084PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003085 Py_ssize_t size,
3086 const char *errors,
3087 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003088{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003089 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003090 Py_ssize_t startinpos;
3091 Py_ssize_t endinpos;
3092 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003093 const char *e;
3094 PyUnicodeObject *unicode;
3095 Py_UNICODE *p;
3096 const char *errmsg = "";
3097 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003098 Py_UNICODE *shiftOutStart;
3099 unsigned int base64bits = 0;
3100 unsigned long base64buffer = 0;
3101 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003102 PyObject *errorHandler = NULL;
3103 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003104
3105 unicode = _PyUnicode_New(size);
3106 if (!unicode)
3107 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003108 if (size == 0) {
3109 if (consumed)
3110 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003111 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003112 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003114 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003115 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003116 e = s + size;
3117
3118 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003119 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003120 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003121 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003122
Antoine Pitrou244651a2009-05-04 18:56:13 +00003123 if (inShift) { /* in a base-64 section */
3124 if (IS_BASE64(ch)) { /* consume a base-64 character */
3125 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3126 base64bits += 6;
3127 s++;
3128 if (base64bits >= 16) {
3129 /* we have enough bits for a UTF-16 value */
3130 Py_UNICODE outCh = (Py_UNICODE)
3131 (base64buffer >> (base64bits-16));
3132 base64bits -= 16;
3133 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3134 if (surrogate) {
3135 /* expecting a second surrogate */
3136 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3137#ifdef Py_UNICODE_WIDE
3138 *p++ = (((surrogate & 0x3FF)<<10)
3139 | (outCh & 0x3FF)) + 0x10000;
3140#else
3141 *p++ = surrogate;
3142 *p++ = outCh;
3143#endif
3144 surrogate = 0;
3145 }
3146 else {
3147 surrogate = 0;
3148 errmsg = "second surrogate missing";
3149 goto utf7Error;
3150 }
3151 }
3152 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3153 /* first surrogate */
3154 surrogate = outCh;
3155 }
3156 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3157 errmsg = "unexpected second surrogate";
3158 goto utf7Error;
3159 }
3160 else {
3161 *p++ = outCh;
3162 }
3163 }
3164 }
3165 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003166 inShift = 0;
3167 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003168 if (surrogate) {
3169 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003170 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003171 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003172 if (base64bits > 0) { /* left-over bits */
3173 if (base64bits >= 6) {
3174 /* We've seen at least one base-64 character */
3175 errmsg = "partial character in shift sequence";
3176 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003177 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003178 else {
3179 /* Some bits remain; they should be zero */
3180 if (base64buffer != 0) {
3181 errmsg = "non-zero padding bits in shift sequence";
3182 goto utf7Error;
3183 }
3184 }
3185 }
3186 if (ch != '-') {
3187 /* '-' is absorbed; other terminating
3188 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003189 *p++ = ch;
3190 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003191 }
3192 }
3193 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003194 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003195 s++; /* consume '+' */
3196 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003197 s++;
3198 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003199 }
3200 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003201 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003202 shiftOutStart = p;
3203 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003204 }
3205 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003206 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003207 *p++ = ch;
3208 s++;
3209 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003210 else {
3211 startinpos = s-starts;
3212 s++;
3213 errmsg = "unexpected special character";
3214 goto utf7Error;
3215 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003216 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003217utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 outpos = p-PyUnicode_AS_UNICODE(unicode);
3219 endinpos = s-starts;
3220 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003221 errors, &errorHandler,
3222 "utf7", errmsg,
3223 &starts, &e, &startinpos, &endinpos, &exc, &s,
3224 &unicode, &outpos, &p))
3225 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003226 }
3227
Antoine Pitrou244651a2009-05-04 18:56:13 +00003228 /* end of string */
3229
3230 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3231 /* if we're in an inconsistent state, that's an error */
3232 if (surrogate ||
3233 (base64bits >= 6) ||
3234 (base64bits > 0 && base64buffer != 0)) {
3235 outpos = p-PyUnicode_AS_UNICODE(unicode);
3236 endinpos = size;
3237 if (unicode_decode_call_errorhandler(
3238 errors, &errorHandler,
3239 "utf7", "unterminated shift sequence",
3240 &starts, &e, &startinpos, &endinpos, &exc, &s,
3241 &unicode, &outpos, &p))
3242 goto onError;
3243 if (s < e)
3244 goto restart;
3245 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003246 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003247
3248 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003249 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003250 if (inShift) {
3251 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003252 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003253 }
3254 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003255 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003256 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003257 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003258
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003259 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003260 goto onError;
3261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 Py_XDECREF(errorHandler);
3263 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003264 if (PyUnicode_READY(unicode) == -1) {
3265 Py_DECREF(unicode);
3266 return NULL;
3267 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003268 return (PyObject *)unicode;
3269
Benjamin Peterson29060642009-01-31 22:14:21 +00003270 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 Py_XDECREF(errorHandler);
3272 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003273 Py_DECREF(unicode);
3274 return NULL;
3275}
3276
3277
Alexander Belopolsky40018472011-02-26 01:02:56 +00003278PyObject *
3279PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003280 Py_ssize_t size,
3281 int base64SetO,
3282 int base64WhiteSpace,
3283 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003284{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003285 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003286 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003287 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003288 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003289 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003290 unsigned int base64bits = 0;
3291 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003292 char * out;
3293 char * start;
3294
3295 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003296 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003297
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003298 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003299 return PyErr_NoMemory();
3300
Antoine Pitrou244651a2009-05-04 18:56:13 +00003301 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003302 if (v == NULL)
3303 return NULL;
3304
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003305 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003306 for (;i < size; ++i) {
3307 Py_UNICODE ch = s[i];
3308
Antoine Pitrou244651a2009-05-04 18:56:13 +00003309 if (inShift) {
3310 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3311 /* shifting out */
3312 if (base64bits) { /* output remaining bits */
3313 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3314 base64buffer = 0;
3315 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003316 }
3317 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003318 /* Characters not in the BASE64 set implicitly unshift the sequence
3319 so no '-' is required, except if the character is itself a '-' */
3320 if (IS_BASE64(ch) || ch == '-') {
3321 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003322 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003323 *out++ = (char) ch;
3324 }
3325 else {
3326 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003327 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003328 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003329 else { /* not in a shift sequence */
3330 if (ch == '+') {
3331 *out++ = '+';
3332 *out++ = '-';
3333 }
3334 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3335 *out++ = (char) ch;
3336 }
3337 else {
3338 *out++ = '+';
3339 inShift = 1;
3340 goto encode_char;
3341 }
3342 }
3343 continue;
3344encode_char:
3345#ifdef Py_UNICODE_WIDE
3346 if (ch >= 0x10000) {
3347 /* code first surrogate */
3348 base64bits += 16;
3349 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3350 while (base64bits >= 6) {
3351 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3352 base64bits -= 6;
3353 }
3354 /* prepare second surrogate */
3355 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3356 }
3357#endif
3358 base64bits += 16;
3359 base64buffer = (base64buffer << 16) | ch;
3360 while (base64bits >= 6) {
3361 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3362 base64bits -= 6;
3363 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003364 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003365 if (base64bits)
3366 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3367 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003368 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003369 if (_PyBytes_Resize(&v, out - start) < 0)
3370 return NULL;
3371 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003372}
3373
Antoine Pitrou244651a2009-05-04 18:56:13 +00003374#undef IS_BASE64
3375#undef FROM_BASE64
3376#undef TO_BASE64
3377#undef DECODE_DIRECT
3378#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003379
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380/* --- UTF-8 Codec -------------------------------------------------------- */
3381
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is indexed by the byte value (0..255) and is only ever
   read, hence it is const-qualified. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3403
Alexander Belopolsky40018472011-02-26 01:02:56 +00003404PyObject *
3405PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003406 Py_ssize_t size,
3407 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408{
Walter Dörwald69652032004-09-07 20:24:22 +00003409 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3410}
3411
Antoine Pitrouab868312009-01-10 15:40:25 +00003412/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3413#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3414
3415/* Mask to quickly check whether a C 'long' contains a
3416 non-ASCII, UTF8-encoded char. */
3417#if (SIZEOF_LONG == 8)
3418# define ASCII_CHAR_MASK 0x8080808080808080L
3419#elif (SIZEOF_LONG == 4)
3420# define ASCII_CHAR_MASK 0x80808080L
3421#else
3422# error C 'long' size should be either 4 or 8!
3423#endif
3424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003425/* Scans a UTF-8 string and returns the maximum character to be expected,
3426 the size of the decoded unicode string and if any major errors were
3427 encountered.
3428
3429 This function does check basic UTF-8 sanity, it does however NOT CHECK
3430 if the string contains surrogates, and if all continuation bytes are
3431 within the correct ranges, these checks are performed in
3432 PyUnicode_DecodeUTF8Stateful.
3433
3434 If it sets has_errors to 1, it means the value of unicode_size and max_char
3435 will be bogus and you should not rely on useful information in them.
3436 */
3437static Py_UCS4
3438utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3439 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3440 int *has_errors)
3441{
3442 Py_ssize_t n;
3443 Py_ssize_t char_count = 0;
3444 Py_UCS4 max_char = 127, new_max;
3445 Py_UCS4 upper_bound;
3446 const unsigned char *p = (const unsigned char *)s;
3447 const unsigned char *end = p + string_size;
3448 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3449 int err = 0;
3450
3451 for (; p < end && !err; ++p, ++char_count) {
3452 /* Only check value if it's not a ASCII char... */
3453 if (*p < 0x80) {
3454 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3455 an explanation. */
3456 if (!((size_t) p & LONG_PTR_MASK)) {
3457 /* Help register allocation */
3458 register const unsigned char *_p = p;
3459 while (_p < aligned_end) {
3460 unsigned long value = *(unsigned long *) _p;
3461 if (value & ASCII_CHAR_MASK)
3462 break;
3463 _p += SIZEOF_LONG;
3464 char_count += SIZEOF_LONG;
3465 }
3466 p = _p;
3467 if (p == end)
3468 break;
3469 }
3470 }
3471 if (*p >= 0x80) {
3472 n = utf8_code_length[*p];
3473 new_max = max_char;
3474 switch (n) {
3475 /* invalid start byte */
3476 case 0:
3477 err = 1;
3478 break;
3479 case 2:
3480 /* Code points between 0x00FF and 0x07FF inclusive.
3481 Approximate the upper bound of the code point,
3482 if this flips over 255 we can be sure it will be more
3483 than 255 and the string will need 2 bytes per code coint,
3484 if it stays under or equal to 255, we can be sure 1 byte
3485 is enough.
3486 ((*p & 0b00011111) << 6) | 0b00111111 */
3487 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3488 if (max_char < upper_bound)
3489 new_max = upper_bound;
3490 /* Ensure we track at least that we left ASCII space. */
3491 if (new_max < 128)
3492 new_max = 128;
3493 break;
3494 case 3:
3495 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3496 always > 255 and <= 65535 and will always need 2 bytes. */
3497 if (max_char < 65535)
3498 new_max = 65535;
3499 break;
3500 case 4:
3501 /* Code point will be above 0xFFFF for sure in this case. */
3502 new_max = 65537;
3503 break;
3504 /* Internal error, this should be caught by the first if */
3505 case 1:
3506 default:
3507 assert(0 && "Impossible case in utf8_max_char_and_size");
3508 err = 1;
3509 }
3510 /* Instead of number of overall bytes for this code point,
3511 n containts the number of following bytes: */
3512 --n;
3513 /* Check if the follow up chars are all valid continuation bytes */
3514 if (n >= 1) {
3515 const unsigned char *cont;
3516 if ((p + n) >= end) {
3517 if (consumed == 0)
3518 /* incomplete data, non-incremental decoding */
3519 err = 1;
3520 break;
3521 }
3522 for (cont = p + 1; cont < (p + n); ++cont) {
3523 if ((*cont & 0xc0) != 0x80) {
3524 err = 1;
3525 break;
3526 }
3527 }
3528 p += n;
3529 }
3530 else
3531 err = 1;
3532 max_char = new_max;
3533 }
3534 }
3535
3536 if (unicode_size)
3537 *unicode_size = char_count;
3538 if (has_errors)
3539 *has_errors = err;
3540 return max_char;
3541}
3542
/* Store `value` at position `index` of the character buffer `buf`,
   casting both the buffer and the value to the element type selected
   by `kind`: Py_UNICODE for PyUnicode_WCHAR_KIND (the wstr field of the
   legacy representation), unsigned char for PyUnicode_1BYTE_KIND,
   Py_UCS2 for PyUnicode_2BYTE_KIND and Py_UCS4 for anything else.
   Similar to PyUnicode_WRITE, which cannot write into wstr.
   `kind` is evaluated once; `buf`, `index` and `value` are evaluated
   once, in the branch that is taken. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int k_ = (kind);                                              \
        if (k_ == PyUnicode_1BYTE_KIND) {                                   \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        }                                                                   \
        else if (k_ == PyUnicode_2BYTE_KIND) {                              \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        }                                                                   \
        else if (k_ == PyUnicode_WCHAR_KIND) {                              \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        }                                                                   \
        else {                                                              \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
        }                                                                   \
    } while (0)
3557
Alexander Belopolsky40018472011-02-26 01:02:56 +00003558PyObject *
3559PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003560 Py_ssize_t size,
3561 const char *errors,
3562 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003563{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003566 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003567 Py_ssize_t startinpos;
3568 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003569 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003571 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 PyObject *errorHandler = NULL;
3573 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003574 Py_UCS4 maxchar = 0;
3575 Py_ssize_t unicode_size;
3576 Py_ssize_t i;
3577 int kind;
3578 void *data;
3579 int has_errors;
3580 Py_UNICODE *error_outptr;
3581#if SIZEOF_WCHAR_T == 2
3582 Py_ssize_t wchar_offset = 0;
3583#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584
Walter Dörwald69652032004-09-07 20:24:22 +00003585 if (size == 0) {
3586 if (consumed)
3587 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003588 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003590 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3591 consumed, &has_errors);
3592 if (has_errors) {
3593 unicode = _PyUnicode_New(size);
3594 if (!unicode)
3595 return NULL;
3596 kind = PyUnicode_WCHAR_KIND;
3597 data = PyUnicode_AS_UNICODE(unicode);
3598 assert(data != NULL);
3599 }
3600 else {
3601 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3602 if (!unicode)
3603 return NULL;
3604 /* When the string is ASCII only, just use memcpy and return.
3605 unicode_size may be != size if there is an incomplete UTF-8
3606 sequence at the end of the ASCII block. */
3607 if (maxchar < 128 && size == unicode_size) {
3608 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3609 return (PyObject *)unicode;
3610 }
3611 kind = PyUnicode_KIND(unicode);
3612 data = PyUnicode_DATA(unicode);
3613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003615 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003617 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618
3619 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003620 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621
3622 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003623 /* Fast path for runs of ASCII characters. Given that common UTF-8
3624 input will consist of an overwhelming majority of ASCII
3625 characters, we try to optimize for this case by checking
3626 as many characters as a C 'long' can contain.
3627 First, check if we can do an aligned read, as most CPUs have
3628 a penalty for unaligned reads.
3629 */
3630 if (!((size_t) s & LONG_PTR_MASK)) {
3631 /* Help register allocation */
3632 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003633 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003634 while (_s < aligned_end) {
3635 /* Read a whole long at a time (either 4 or 8 bytes),
3636 and do a fast unrolled copy if it only contains ASCII
3637 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003638 unsigned long value = *(unsigned long *) _s;
3639 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003640 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003641 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3642 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3643 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3644 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003645#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003646 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3647 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3648 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3649 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003650#endif
3651 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003652 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003653 }
3654 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003655 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003656 if (s == e)
3657 break;
3658 ch = (unsigned char)*s;
3659 }
3660 }
3661
3662 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003663 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 s++;
3665 continue;
3666 }
3667
3668 n = utf8_code_length[ch];
3669
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003670 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 if (consumed)
3672 break;
3673 else {
3674 errmsg = "unexpected end of data";
3675 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003676 endinpos = startinpos+1;
3677 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3678 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003679 goto utf8Error;
3680 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003681 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682
3683 switch (n) {
3684
3685 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003686 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003687 startinpos = s-starts;
3688 endinpos = startinpos+1;
3689 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690
3691 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003692 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 startinpos = s-starts;
3694 endinpos = startinpos+1;
3695 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696
3697 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003698 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003699 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003700 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003701 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003702 goto utf8Error;
3703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003705 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003706 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 break;
3708
3709 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003710 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3711 will result in surrogates in range d800-dfff. Surrogates are
3712 not valid UTF-8 so they are rejected.
3713 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3714 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003715 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003716 (s[2] & 0xc0) != 0x80 ||
3717 ((unsigned char)s[0] == 0xE0 &&
3718 (unsigned char)s[1] < 0xA0) ||
3719 ((unsigned char)s[0] == 0xED &&
3720 (unsigned char)s[1] > 0x9F)) {
3721 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003722 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003723 endinpos = startinpos + 1;
3724
3725 /* if s[1] first two bits are 1 and 0, then the invalid
3726 continuation byte is s[2], so increment endinpos by 1,
3727 if not, s[1] is invalid and endinpos doesn't need to
3728 be incremented. */
3729 if ((s[1] & 0xC0) == 0x80)
3730 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003731 goto utf8Error;
3732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003734 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003735 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003736 break;
3737
3738 case 4:
3739 if ((s[1] & 0xc0) != 0x80 ||
3740 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003741 (s[3] & 0xc0) != 0x80 ||
3742 ((unsigned char)s[0] == 0xF0 &&
3743 (unsigned char)s[1] < 0x90) ||
3744 ((unsigned char)s[0] == 0xF4 &&
3745 (unsigned char)s[1] > 0x8F)) {
3746 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003747 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003748 endinpos = startinpos + 1;
3749 if ((s[1] & 0xC0) == 0x80) {
3750 endinpos++;
3751 if ((s[2] & 0xC0) == 0x80)
3752 endinpos++;
3753 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003754 goto utf8Error;
3755 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003756 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003757 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3758 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003760 /* If the string is flexible or we have native UCS-4, write
3761 directly.. */
3762 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3763 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 else {
3766 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003768 /* translate from 10000..10FFFF to 0..FFFF */
3769 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 /* high surrogate = top 10 bits added to D800 */
3772 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3773 (Py_UNICODE)(0xD800 + (ch >> 10)));
3774
3775 /* low surrogate = bottom 10 bits added to DC00 */
3776 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3777 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3778 }
3779#if SIZEOF_WCHAR_T == 2
3780 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003781#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 }
3784 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003785 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003786
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788 /* If this is not yet a resizable string, make it one.. */
3789 if (kind != PyUnicode_WCHAR_KIND) {
3790 const Py_UNICODE *u;
3791 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3792 if (!new_unicode)
3793 goto onError;
3794 u = PyUnicode_AsUnicode((PyObject *)unicode);
3795 if (!u)
3796 goto onError;
3797#if SIZEOF_WCHAR_T == 2
3798 i += wchar_offset;
3799#endif
3800 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3801 Py_DECREF(unicode);
3802 unicode = new_unicode;
3803 kind = 0;
3804 data = PyUnicode_AS_UNICODE(new_unicode);
3805 assert(data != NULL);
3806 }
3807 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003808 if (unicode_decode_call_errorhandler(
3809 errors, &errorHandler,
3810 "utf8", errmsg,
3811 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003813 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814 /* Update data because unicode_decode_call_errorhandler might have
3815 re-created or resized the unicode object. */
3816 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003817 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819 /* Ensure the unicode_size calculation above was correct: */
3820 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3821
Walter Dörwald69652032004-09-07 20:24:22 +00003822 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003823 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003825 /* Adjust length and ready string when it contained errors and
3826 is of the old resizable kind. */
3827 if (kind == PyUnicode_WCHAR_KIND) {
3828 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3829 PyUnicode_READY(unicode) == -1)
3830 goto onError;
3831 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003833 Py_XDECREF(errorHandler);
3834 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835 if (PyUnicode_READY(unicode) == -1) {
3836 Py_DECREF(unicode);
3837 return NULL;
3838 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 return (PyObject *)unicode;
3840
Benjamin Peterson29060642009-01-31 22:14:21 +00003841 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003842 Py_XDECREF(errorHandler);
3843 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 Py_DECREF(unicode);
3845 return NULL;
3846}
3847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003848#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003849
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003850#ifdef __APPLE__
3851
3852/* Simplified UTF-8 decoder using surrogateescape error handler,
3853 used to decode the command line arguments on Mac OS X. */
3854
3855wchar_t*
3856_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3857{
3858 int n;
3859 const char *e;
3860 wchar_t *unicode, *p;
3861
3862 /* Note: size will always be longer than the resulting Unicode
3863 character count */
3864 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3865 PyErr_NoMemory();
3866 return NULL;
3867 }
3868 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3869 if (!unicode)
3870 return NULL;
3871
3872 /* Unpack UTF-8 encoded data */
3873 p = unicode;
3874 e = s + size;
3875 while (s < e) {
3876 Py_UCS4 ch = (unsigned char)*s;
3877
3878 if (ch < 0x80) {
3879 *p++ = (wchar_t)ch;
3880 s++;
3881 continue;
3882 }
3883
3884 n = utf8_code_length[ch];
3885 if (s + n > e) {
3886 goto surrogateescape;
3887 }
3888
3889 switch (n) {
3890 case 0:
3891 case 1:
3892 goto surrogateescape;
3893
3894 case 2:
3895 if ((s[1] & 0xc0) != 0x80)
3896 goto surrogateescape;
3897 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3898 assert ((ch > 0x007F) && (ch <= 0x07FF));
3899 *p++ = (wchar_t)ch;
3900 break;
3901
3902 case 3:
3903 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3904 will result in surrogates in range d800-dfff. Surrogates are
3905 not valid UTF-8 so they are rejected.
3906 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3907 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3908 if ((s[1] & 0xc0) != 0x80 ||
3909 (s[2] & 0xc0) != 0x80 ||
3910 ((unsigned char)s[0] == 0xE0 &&
3911 (unsigned char)s[1] < 0xA0) ||
3912 ((unsigned char)s[0] == 0xED &&
3913 (unsigned char)s[1] > 0x9F)) {
3914
3915 goto surrogateescape;
3916 }
3917 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3918 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003920 break;
3921
3922 case 4:
3923 if ((s[1] & 0xc0) != 0x80 ||
3924 (s[2] & 0xc0) != 0x80 ||
3925 (s[3] & 0xc0) != 0x80 ||
3926 ((unsigned char)s[0] == 0xF0 &&
3927 (unsigned char)s[1] < 0x90) ||
3928 ((unsigned char)s[0] == 0xF4 &&
3929 (unsigned char)s[1] > 0x8F)) {
3930 goto surrogateescape;
3931 }
3932 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3933 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3934 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3935
3936#if SIZEOF_WCHAR_T == 4
3937 *p++ = (wchar_t)ch;
3938#else
3939 /* compute and append the two surrogates: */
3940
3941 /* translate from 10000..10FFFF to 0..FFFF */
3942 ch -= 0x10000;
3943
3944 /* high surrogate = top 10 bits added to D800 */
3945 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3946
3947 /* low surrogate = bottom 10 bits added to DC00 */
3948 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3949#endif
3950 break;
3951 }
3952 s += n;
3953 continue;
3954
3955 surrogateescape:
3956 *p++ = 0xDC00 + ch;
3957 s++;
3958 }
3959 *p = L'\0';
3960 return unicode;
3961}
3962
3963#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003965/* Primary internal function which creates utf8 encoded bytes objects.
3966
3967 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003968 and allocate exactly as much space needed at the end. Else allocate the
3969 maximum possible needed (4 result bytes per Unicode character), and return
3970 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003971*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003972PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974{
Tim Peters602f7402002-04-27 18:03:26 +00003975#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003976
Guido van Rossum98297ee2007-11-06 21:34:58 +00003977 Py_ssize_t i; /* index into s of next input byte */
3978 PyObject *result; /* result string object */
3979 char *p; /* next free byte in output buffer */
3980 Py_ssize_t nallocated; /* number of result bytes allocated */
3981 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003982 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003983 PyObject *errorHandler = NULL;
3984 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985 int kind;
3986 void *data;
3987 Py_ssize_t size;
3988 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3989#if SIZEOF_WCHAR_T == 2
3990 Py_ssize_t wchar_offset = 0;
3991#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993 if (!PyUnicode_Check(unicode)) {
3994 PyErr_BadArgument();
3995 return NULL;
3996 }
3997
3998 if (PyUnicode_READY(unicode) == -1)
3999 return NULL;
4000
4001 if (_PyUnicode_UTF8(unicode))
4002 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
4003 _PyUnicode_UTF8_LENGTH(unicode));
4004
4005 kind = PyUnicode_KIND(unicode);
4006 data = PyUnicode_DATA(unicode);
4007 size = PyUnicode_GET_LENGTH(unicode);
4008
Tim Peters602f7402002-04-27 18:03:26 +00004009 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010
Tim Peters602f7402002-04-27 18:03:26 +00004011 if (size <= MAX_SHORT_UNICHARS) {
4012 /* Write into the stack buffer; nallocated can't overflow.
4013 * At the end, we'll allocate exactly as much heap space as it
4014 * turns out we need.
4015 */
4016 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004017 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004018 p = stackbuf;
4019 }
4020 else {
4021 /* Overallocate on the heap, and give the excess back at the end. */
4022 nallocated = size * 4;
4023 if (nallocated / 4 != size) /* overflow! */
4024 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004025 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004026 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004027 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004028 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004029 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004030
Tim Peters602f7402002-04-27 18:03:26 +00004031 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004033
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004034 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004035 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004037
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004039 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004040 *p++ = (char)(0xc0 | (ch >> 6));
4041 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004042 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043 Py_ssize_t newpos;
4044 PyObject *rep;
4045 Py_ssize_t repsize, k, startpos;
4046 startpos = i-1;
4047#if SIZEOF_WCHAR_T == 2
4048 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004049#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 rep = unicode_encode_call_errorhandler(
4051 errors, &errorHandler, "utf-8", "surrogates not allowed",
4052 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4053 &exc, startpos, startpos+1, &newpos);
4054 if (!rep)
4055 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 if (PyBytes_Check(rep))
4058 repsize = PyBytes_GET_SIZE(rep);
4059 else
4060 repsize = PyUnicode_GET_SIZE(rep);
4061
4062 if (repsize > 4) {
4063 Py_ssize_t offset;
4064
4065 if (result == NULL)
4066 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004067 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004070 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4071 /* integer overflow */
4072 PyErr_NoMemory();
4073 goto error;
4074 }
4075 nallocated += repsize - 4;
4076 if (result != NULL) {
4077 if (_PyBytes_Resize(&result, nallocated) < 0)
4078 goto error;
4079 } else {
4080 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004081 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 goto error;
4083 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4084 }
4085 p = PyBytes_AS_STRING(result) + offset;
4086 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004088 if (PyBytes_Check(rep)) {
4089 char *prep = PyBytes_AS_STRING(rep);
4090 for(k = repsize; k > 0; k--)
4091 *p++ = *prep++;
4092 } else /* rep is unicode */ {
4093 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4094 Py_UNICODE c;
4095
4096 for(k=0; k<repsize; k++) {
4097 c = prep[k];
4098 if (0x80 <= c) {
4099 raise_encode_exception(&exc, "utf-8",
4100 PyUnicode_AS_UNICODE(unicode),
4101 size, i-1, i,
4102 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004103 goto error;
4104 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004106 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004107 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004108 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004109 } else if (ch < 0x10000) {
4110 *p++ = (char)(0xe0 | (ch >> 12));
4111 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4112 *p++ = (char)(0x80 | (ch & 0x3f));
4113 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004114 /* Encode UCS4 Unicode ordinals */
4115 *p++ = (char)(0xf0 | (ch >> 18));
4116 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4117 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4118 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004119#if SIZEOF_WCHAR_T == 2
4120 wchar_offset++;
4121#endif
Tim Peters602f7402002-04-27 18:03:26 +00004122 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004124
Guido van Rossum98297ee2007-11-06 21:34:58 +00004125 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004126 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004127 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004128 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004129 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004130 }
4131 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004132 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004133 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004134 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004135 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004136 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004137
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004138 Py_XDECREF(errorHandler);
4139 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004140 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004141 error:
4142 Py_XDECREF(errorHandler);
4143 Py_XDECREF(exc);
4144 Py_XDECREF(result);
4145 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004146
Tim Peters602f7402002-04-27 18:03:26 +00004147#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148}
4149
Alexander Belopolsky40018472011-02-26 01:02:56 +00004150PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004151PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4152 Py_ssize_t size,
4153 const char *errors)
4154{
4155 PyObject *v, *unicode;
4156
4157 unicode = PyUnicode_FromUnicode(s, size);
4158 if (unicode == NULL)
4159 return NULL;
4160 v = _PyUnicode_AsUTF8String(unicode, errors);
4161 Py_DECREF(unicode);
4162 return v;
4163}
4164
4165PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004166PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004168 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169}
4170
Walter Dörwald41980ca2007-08-16 21:55:45 +00004171/* --- UTF-32 Codec ------------------------------------------------------- */
4172
4173PyObject *
4174PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004175 Py_ssize_t size,
4176 const char *errors,
4177 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004178{
4179 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4180}
4181
4182PyObject *
4183PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004184 Py_ssize_t size,
4185 const char *errors,
4186 int *byteorder,
4187 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004188{
4189 const char *starts = s;
4190 Py_ssize_t startinpos;
4191 Py_ssize_t endinpos;
4192 Py_ssize_t outpos;
4193 PyUnicodeObject *unicode;
4194 Py_UNICODE *p;
4195#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004196 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004197 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004198#else
4199 const int pairs = 0;
4200#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004201 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004202 int bo = 0; /* assume native ordering by default */
4203 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004204 /* Offsets from q for retrieving bytes in the right order. */
4205#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4206 int iorder[] = {0, 1, 2, 3};
4207#else
4208 int iorder[] = {3, 2, 1, 0};
4209#endif
4210 PyObject *errorHandler = NULL;
4211 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004212
Walter Dörwald41980ca2007-08-16 21:55:45 +00004213 q = (unsigned char *)s;
4214 e = q + size;
4215
4216 if (byteorder)
4217 bo = *byteorder;
4218
4219 /* Check for BOM marks (U+FEFF) in the input and adjust current
4220 byte order setting accordingly. In native mode, the leading BOM
4221 mark is skipped, in all other modes, it is copied to the output
4222 stream as-is (giving a ZWNBSP character). */
4223 if (bo == 0) {
4224 if (size >= 4) {
4225 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004227#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 if (bom == 0x0000FEFF) {
4229 q += 4;
4230 bo = -1;
4231 }
4232 else if (bom == 0xFFFE0000) {
4233 q += 4;
4234 bo = 1;
4235 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004236#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004237 if (bom == 0x0000FEFF) {
4238 q += 4;
4239 bo = 1;
4240 }
4241 else if (bom == 0xFFFE0000) {
4242 q += 4;
4243 bo = -1;
4244 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004245#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004247 }
4248
4249 if (bo == -1) {
4250 /* force LE */
4251 iorder[0] = 0;
4252 iorder[1] = 1;
4253 iorder[2] = 2;
4254 iorder[3] = 3;
4255 }
4256 else if (bo == 1) {
4257 /* force BE */
4258 iorder[0] = 3;
4259 iorder[1] = 2;
4260 iorder[2] = 1;
4261 iorder[3] = 0;
4262 }
4263
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004264 /* On narrow builds we split characters outside the BMP into two
4265 codepoints => count how much extra space we need. */
4266#ifndef Py_UNICODE_WIDE
4267 for (qq = q; qq < e; qq += 4)
4268 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4269 pairs++;
4270#endif
4271
4272 /* This might be one to much, because of a BOM */
4273 unicode = _PyUnicode_New((size+3)/4+pairs);
4274 if (!unicode)
4275 return NULL;
4276 if (size == 0)
4277 return (PyObject *)unicode;
4278
4279 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004280 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004281
Walter Dörwald41980ca2007-08-16 21:55:45 +00004282 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 Py_UCS4 ch;
4284 /* remaining bytes at the end? (size should be divisible by 4) */
4285 if (e-q<4) {
4286 if (consumed)
4287 break;
4288 errmsg = "truncated data";
4289 startinpos = ((const char *)q)-starts;
4290 endinpos = ((const char *)e)-starts;
4291 goto utf32Error;
4292 /* The remaining input chars are ignored if the callback
4293 chooses to skip the input */
4294 }
4295 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4296 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004297
Benjamin Peterson29060642009-01-31 22:14:21 +00004298 if (ch >= 0x110000)
4299 {
4300 errmsg = "codepoint not in range(0x110000)";
4301 startinpos = ((const char *)q)-starts;
4302 endinpos = startinpos+4;
4303 goto utf32Error;
4304 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004305#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004306 if (ch >= 0x10000)
4307 {
4308 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4309 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4310 }
4311 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004312#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004313 *p++ = ch;
4314 q += 4;
4315 continue;
4316 utf32Error:
4317 outpos = p-PyUnicode_AS_UNICODE(unicode);
4318 if (unicode_decode_call_errorhandler(
4319 errors, &errorHandler,
4320 "utf32", errmsg,
4321 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4322 &unicode, &outpos, &p))
4323 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004324 }
4325
4326 if (byteorder)
4327 *byteorder = bo;
4328
4329 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004330 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004331
4332 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004333 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004334 goto onError;
4335
4336 Py_XDECREF(errorHandler);
4337 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004338 if (PyUnicode_READY(unicode) == -1) {
4339 Py_DECREF(unicode);
4340 return NULL;
4341 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004342 return (PyObject *)unicode;
4343
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004345 Py_DECREF(unicode);
4346 Py_XDECREF(errorHandler);
4347 Py_XDECREF(exc);
4348 return NULL;
4349}
4350
4351PyObject *
4352PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 Py_ssize_t size,
4354 const char *errors,
4355 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004356{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004357 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004358 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004359 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004360#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004361 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004362#else
4363 const int pairs = 0;
4364#endif
4365 /* Offsets from p for storing byte pairs in the right order. */
4366#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4367 int iorder[] = {0, 1, 2, 3};
4368#else
4369 int iorder[] = {3, 2, 1, 0};
4370#endif
4371
Benjamin Peterson29060642009-01-31 22:14:21 +00004372#define STORECHAR(CH) \
4373 do { \
4374 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4375 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4376 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4377 p[iorder[0]] = (CH) & 0xff; \
4378 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004379 } while(0)
4380
4381 /* In narrow builds we can output surrogate pairs as one codepoint,
4382 so we need less space. */
4383#ifndef Py_UNICODE_WIDE
4384 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004385 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4386 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4387 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004388#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004389 nsize = (size - pairs + (byteorder == 0));
4390 bytesize = nsize * 4;
4391 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004392 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004393 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004394 if (v == NULL)
4395 return NULL;
4396
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004397 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004398 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004400 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004401 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004402
4403 if (byteorder == -1) {
4404 /* force LE */
4405 iorder[0] = 0;
4406 iorder[1] = 1;
4407 iorder[2] = 2;
4408 iorder[3] = 3;
4409 }
4410 else if (byteorder == 1) {
4411 /* force BE */
4412 iorder[0] = 3;
4413 iorder[1] = 2;
4414 iorder[2] = 1;
4415 iorder[3] = 0;
4416 }
4417
4418 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004419 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004420#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004421 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4422 Py_UCS4 ch2 = *s;
4423 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4424 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4425 s++;
4426 size--;
4427 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004428 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004429#endif
4430 STORECHAR(ch);
4431 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004432
4433 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004434 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004435#undef STORECHAR
4436}
4437
Alexander Belopolsky40018472011-02-26 01:02:56 +00004438PyObject *
4439PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004440{
4441 if (!PyUnicode_Check(unicode)) {
4442 PyErr_BadArgument();
4443 return NULL;
4444 }
4445 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 PyUnicode_GET_SIZE(unicode),
4447 NULL,
4448 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004449}
4450
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451/* --- UTF-16 Codec ------------------------------------------------------- */
4452
Tim Peters772747b2001-08-09 22:21:55 +00004453PyObject *
4454PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004455 Py_ssize_t size,
4456 const char *errors,
4457 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458{
Walter Dörwald69652032004-09-07 20:24:22 +00004459 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4460}
4461
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   Each mask has one bit set per 16-bit unit of the 'long': bit 15 of the
   unit for native order, bit 7 for byteswapped order (where the two bytes
   of each unit are exchanged).  A word with none of those bits set holds
   only code units below 0x8000 and therefore no surrogates
   (0xD800-0xDFFF).
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
4478
Walter Dörwald69652032004-09-07 20:24:22 +00004479PyObject *
4480PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 Py_ssize_t size,
4482 const char *errors,
4483 int *byteorder,
4484 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004485{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004487 Py_ssize_t startinpos;
4488 Py_ssize_t endinpos;
4489 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 PyUnicodeObject *unicode;
4491 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004492 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004493 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004494 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004495 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004496 /* Offsets from q for retrieving byte pairs in the right order. */
4497#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4498 int ihi = 1, ilo = 0;
4499#else
4500 int ihi = 0, ilo = 1;
4501#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004502 PyObject *errorHandler = NULL;
4503 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504
4505 /* Note: size will always be longer than the resulting Unicode
4506 character count */
4507 unicode = _PyUnicode_New(size);
4508 if (!unicode)
4509 return NULL;
4510 if (size == 0)
4511 return (PyObject *)unicode;
4512
4513 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004514 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004515 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004516 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517
4518 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004519 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004521 /* Check for BOM marks (U+FEFF) in the input and adjust current
4522 byte order setting accordingly. In native mode, the leading BOM
4523 mark is skipped, in all other modes, it is copied to the output
4524 stream as-is (giving a ZWNBSP character). */
4525 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004526 if (size >= 2) {
4527 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004528#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 if (bom == 0xFEFF) {
4530 q += 2;
4531 bo = -1;
4532 }
4533 else if (bom == 0xFFFE) {
4534 q += 2;
4535 bo = 1;
4536 }
Tim Petersced69f82003-09-16 20:30:58 +00004537#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004538 if (bom == 0xFEFF) {
4539 q += 2;
4540 bo = 1;
4541 }
4542 else if (bom == 0xFFFE) {
4543 q += 2;
4544 bo = -1;
4545 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004546#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004547 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004548 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004549
Tim Peters772747b2001-08-09 22:21:55 +00004550 if (bo == -1) {
4551 /* force LE */
4552 ihi = 1;
4553 ilo = 0;
4554 }
4555 else if (bo == 1) {
4556 /* force BE */
4557 ihi = 0;
4558 ilo = 1;
4559 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004560#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4561 native_ordering = ilo < ihi;
4562#else
4563 native_ordering = ilo > ihi;
4564#endif
Tim Peters772747b2001-08-09 22:21:55 +00004565
Antoine Pitrouab868312009-01-10 15:40:25 +00004566 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004567 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004568 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004569 /* First check for possible aligned read of a C 'long'. Unaligned
4570 reads are more expensive, better to defer to another iteration. */
4571 if (!((size_t) q & LONG_PTR_MASK)) {
4572 /* Fast path for runs of non-surrogate chars. */
4573 register const unsigned char *_q = q;
4574 Py_UNICODE *_p = p;
4575 if (native_ordering) {
4576 /* Native ordering is simple: as long as the input cannot
4577 possibly contain a surrogate char, do an unrolled copy
4578 of several 16-bit code points to the target object.
4579 The non-surrogate check is done on several input bytes
4580 at a time (as many as a C 'long' can contain). */
4581 while (_q < aligned_end) {
4582 unsigned long data = * (unsigned long *) _q;
4583 if (data & FAST_CHAR_MASK)
4584 break;
4585 _p[0] = ((unsigned short *) _q)[0];
4586 _p[1] = ((unsigned short *) _q)[1];
4587#if (SIZEOF_LONG == 8)
4588 _p[2] = ((unsigned short *) _q)[2];
4589 _p[3] = ((unsigned short *) _q)[3];
4590#endif
4591 _q += SIZEOF_LONG;
4592 _p += SIZEOF_LONG / 2;
4593 }
4594 }
4595 else {
4596 /* Byteswapped ordering is similar, but we must decompose
4597 the copy bytewise, and take care of zero'ing out the
4598 upper bytes if the target object is in 32-bit units
4599 (that is, in UCS-4 builds). */
4600 while (_q < aligned_end) {
4601 unsigned long data = * (unsigned long *) _q;
4602 if (data & SWAPPED_FAST_CHAR_MASK)
4603 break;
4604 /* Zero upper bytes in UCS-4 builds */
4605#if (Py_UNICODE_SIZE > 2)
4606 _p[0] = 0;
4607 _p[1] = 0;
4608#if (SIZEOF_LONG == 8)
4609 _p[2] = 0;
4610 _p[3] = 0;
4611#endif
4612#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004613 /* Issue #4916; UCS-4 builds on big endian machines must
4614 fill the two last bytes of each 4-byte unit. */
4615#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4616# define OFF 2
4617#else
4618# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004619#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004620 ((unsigned char *) _p)[OFF + 1] = _q[0];
4621 ((unsigned char *) _p)[OFF + 0] = _q[1];
4622 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4623 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4624#if (SIZEOF_LONG == 8)
4625 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4626 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4627 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4628 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4629#endif
4630#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004631 _q += SIZEOF_LONG;
4632 _p += SIZEOF_LONG / 2;
4633 }
4634 }
4635 p = _p;
4636 q = _q;
4637 if (q >= e)
4638 break;
4639 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004640 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641
Benjamin Peterson14339b62009-01-31 16:36:08 +00004642 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004643
4644 if (ch < 0xD800 || ch > 0xDFFF) {
4645 *p++ = ch;
4646 continue;
4647 }
4648
4649 /* UTF-16 code pair: */
4650 if (q > e) {
4651 errmsg = "unexpected end of data";
4652 startinpos = (((const char *)q) - 2) - starts;
4653 endinpos = ((const char *)e) + 1 - starts;
4654 goto utf16Error;
4655 }
4656 if (0xD800 <= ch && ch <= 0xDBFF) {
4657 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4658 q += 2;
4659 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004660#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004661 *p++ = ch;
4662 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004663#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004664 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004665#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004666 continue;
4667 }
4668 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004669 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004670 startinpos = (((const char *)q)-4)-starts;
4671 endinpos = startinpos+2;
4672 goto utf16Error;
4673 }
4674
Benjamin Peterson14339b62009-01-31 16:36:08 +00004675 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004676 errmsg = "illegal encoding";
4677 startinpos = (((const char *)q)-2)-starts;
4678 endinpos = startinpos+2;
4679 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004680
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 utf16Error:
4682 outpos = p - PyUnicode_AS_UNICODE(unicode);
4683 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004684 errors,
4685 &errorHandler,
4686 "utf16", errmsg,
4687 &starts,
4688 (const char **)&e,
4689 &startinpos,
4690 &endinpos,
4691 &exc,
4692 (const char **)&q,
4693 &unicode,
4694 &outpos,
4695 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004696 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004698 /* remaining byte at the end? (size should be even) */
4699 if (e == q) {
4700 if (!consumed) {
4701 errmsg = "truncated data";
4702 startinpos = ((const char *)q) - starts;
4703 endinpos = ((const char *)e) + 1 - starts;
4704 outpos = p - PyUnicode_AS_UNICODE(unicode);
4705 if (unicode_decode_call_errorhandler(
4706 errors,
4707 &errorHandler,
4708 "utf16", errmsg,
4709 &starts,
4710 (const char **)&e,
4711 &startinpos,
4712 &endinpos,
4713 &exc,
4714 (const char **)&q,
4715 &unicode,
4716 &outpos,
4717 &p))
4718 goto onError;
4719 /* The remaining input chars are ignored if the callback
4720 chooses to skip the input */
4721 }
4722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723
4724 if (byteorder)
4725 *byteorder = bo;
4726
Walter Dörwald69652032004-09-07 20:24:22 +00004727 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004728 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004729
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004731 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 goto onError;
4733
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004734 Py_XDECREF(errorHandler);
4735 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004736 if (PyUnicode_READY(unicode) == -1) {
4737 Py_DECREF(unicode);
4738 return NULL;
4739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 return (PyObject *)unicode;
4741
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004744 Py_XDECREF(errorHandler);
4745 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 return NULL;
4747}
4748
Antoine Pitrouab868312009-01-10 15:40:25 +00004749#undef FAST_CHAR_MASK
4750#undef SWAPPED_FAST_CHAR_MASK
4751
Tim Peters772747b2001-08-09 22:21:55 +00004752PyObject *
4753PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004754 Py_ssize_t size,
4755 const char *errors,
4756 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004758 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004759 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004760 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004761#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004762 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004763#else
4764 const int pairs = 0;
4765#endif
Tim Peters772747b2001-08-09 22:21:55 +00004766 /* Offsets from p for storing byte pairs in the right order. */
4767#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4768 int ihi = 1, ilo = 0;
4769#else
4770 int ihi = 0, ilo = 1;
4771#endif
4772
Benjamin Peterson29060642009-01-31 22:14:21 +00004773#define STORECHAR(CH) \
4774 do { \
4775 p[ihi] = ((CH) >> 8) & 0xff; \
4776 p[ilo] = (CH) & 0xff; \
4777 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004778 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004780#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004781 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 if (s[i] >= 0x10000)
4783 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004784#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004785 /* 2 * (size + pairs + (byteorder == 0)) */
4786 if (size > PY_SSIZE_T_MAX ||
4787 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004788 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004789 nsize = size + pairs + (byteorder == 0);
4790 bytesize = nsize * 2;
4791 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004793 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 if (v == NULL)
4795 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004797 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004799 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004800 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004801 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004802
4803 if (byteorder == -1) {
4804 /* force LE */
4805 ihi = 1;
4806 ilo = 0;
4807 }
4808 else if (byteorder == 1) {
4809 /* force BE */
4810 ihi = 0;
4811 ilo = 1;
4812 }
4813
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004814 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004815 Py_UNICODE ch = *s++;
4816 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004817#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 if (ch >= 0x10000) {
4819 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4820 ch = 0xD800 | ((ch-0x10000) >> 10);
4821 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004822#endif
Tim Peters772747b2001-08-09 22:21:55 +00004823 STORECHAR(ch);
4824 if (ch2)
4825 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004826 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004827
4828 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004829 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004830#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831}
4832
Alexander Belopolsky40018472011-02-26 01:02:56 +00004833PyObject *
4834PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835{
4836 if (!PyUnicode_Check(unicode)) {
4837 PyErr_BadArgument();
4838 return NULL;
4839 }
4840 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004841 PyUnicode_GET_SIZE(unicode),
4842 NULL,
4843 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844}
4845
4846/* --- Unicode Escape Codec ----------------------------------------------- */
4847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004848/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4849 if all the escapes in the string make it still a valid ASCII string.
4850 Returns -1 if any escapes were found which cause the string to
4851 pop out of ASCII range. Otherwise returns the length of the
4852 required buffer to hold the string.
4853 */
4854Py_ssize_t
4855length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4856{
4857 const unsigned char *p = (const unsigned char *)s;
4858 const unsigned char *end = p + size;
4859 Py_ssize_t length = 0;
4860
4861 if (size < 0)
4862 return -1;
4863
4864 for (; p < end; ++p) {
4865 if (*p > 127) {
4866 /* Non-ASCII */
4867 return -1;
4868 }
4869 else if (*p != '\\') {
4870 /* Normal character */
4871 ++length;
4872 }
4873 else {
4874 /* Backslash-escape, check next char */
4875 ++p;
4876 /* Escape sequence reaches till end of string or
4877 non-ASCII follow-up. */
4878 if (p >= end || *p > 127)
4879 return -1;
4880 switch (*p) {
4881 case '\n':
4882 /* backslash + \n result in zero characters */
4883 break;
4884 case '\\': case '\'': case '\"':
4885 case 'b': case 'f': case 't':
4886 case 'n': case 'r': case 'v': case 'a':
4887 ++length;
4888 break;
4889 case '0': case '1': case '2': case '3':
4890 case '4': case '5': case '6': case '7':
4891 case 'x': case 'u': case 'U': case 'N':
4892 /* these do not guarantee ASCII characters */
4893 return -1;
4894 default:
4895 /* count the backslash + the other character */
4896 length += 2;
4897 }
4898 }
4899 }
4900 return length;
4901}
4902
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   When `kind` is PyUnicode_WCHAR_KIND, `buf` is treated as a Py_UNICODE
   array; for any other kind it is treated as an array of bytes and
   `value` is truncated to unsigned char.  `index` and `value` are each
   evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store `value` into the Py_UNICODE array `buf` at `index`.  Asserts that
   a variable named `kind`, which must be in scope at the point of use,
   equals PyUnicode_WCHAR_KIND.  The whole expansion is parenthesized so
   that the comma expression cannot be split by a surrounding operator. */
#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
4916
4917
Fredrik Lundh06d12682001-01-24 07:59:11 +00004918static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004919
Alexander Belopolsky40018472011-02-26 01:02:56 +00004920PyObject *
4921PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004922 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004923 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004925 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004926 Py_ssize_t startinpos;
4927 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004928 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004932 char* message;
4933 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004934 PyObject *errorHandler = NULL;
4935 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004936 Py_ssize_t ascii_length;
4937 Py_ssize_t i;
4938 int kind;
4939 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004941 ascii_length = length_of_escaped_ascii_string(s, size);
4942
4943 /* After length_of_escaped_ascii_string() there are two alternatives,
4944 either the string is pure ASCII with named escapes like \n, etc.
4945 and we determined it's exact size (common case)
4946 or it contains \x, \u, ... escape sequences. then we create a
4947 legacy wchar string and resize it at the end of this function. */
4948 if (ascii_length >= 0) {
4949 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4950 if (!v)
4951 goto onError;
4952 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4953 kind = PyUnicode_1BYTE_KIND;
4954 data = PyUnicode_DATA(v);
4955 }
4956 else {
4957 /* Escaped strings will always be longer than the resulting
4958 Unicode string, so we start with size here and then reduce the
4959 length after conversion to the true value.
4960 (but if the error callback returns a long replacement string
4961 we'll have to allocate more space) */
4962 v = _PyUnicode_New(size);
4963 if (!v)
4964 goto onError;
4965 kind = PyUnicode_WCHAR_KIND;
4966 data = PyUnicode_AS_UNICODE(v);
4967 }
4968
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969 if (size == 0)
4970 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004971 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004973
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 while (s < end) {
4975 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004976 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004977 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004979 if (kind == PyUnicode_WCHAR_KIND) {
4980 assert(i < _PyUnicode_WSTR_LENGTH(v));
4981 }
4982 else {
4983 /* The only case in which i == ascii_length is a backslash
4984 followed by a newline. */
4985 assert(i <= ascii_length);
4986 }
4987
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988 /* Non-escape characters are interpreted as Unicode ordinals */
4989 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004990 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 continue;
4992 }
4993
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995 /* \ - Escapes */
4996 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004997 c = *s++;
4998 if (s > end)
4999 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005000
5001 if (kind == PyUnicode_WCHAR_KIND) {
5002 assert(i < _PyUnicode_WSTR_LENGTH(v));
5003 }
5004 else {
5005 /* The only case in which i == ascii_length is a backslash
5006 followed by a newline. */
5007 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5008 }
5009
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005010 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005011
Benjamin Peterson29060642009-01-31 22:14:21 +00005012 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005014 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5015 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5016 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5017 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5018 /* FF */
5019 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5020 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5021 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5022 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5023 /* VT */
5024 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5025 /* BEL, not classic C */
5026 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005029 case '0': case '1': case '2': case '3':
5030 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005031 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005032 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005033 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005034 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005035 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005037 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005038 break;
5039
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 /* hex escapes */
5041 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005043 digits = 2;
5044 message = "truncated \\xXX escape";
5045 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005049 digits = 4;
5050 message = "truncated \\uXXXX escape";
5051 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005054 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005055 digits = 8;
5056 message = "truncated \\UXXXXXXXX escape";
5057 hexescape:
5058 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005059 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005060 if (s+digits>end) {
5061 endinpos = size;
5062 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 errors, &errorHandler,
5064 "unicodeescape", "end of string in escape sequence",
5065 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005066 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005067 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005068 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005069 goto nextByte;
5070 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005071 for (j = 0; j < digits; ++j) {
5072 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005073 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005074 endinpos = (s+j+1)-starts;
5075 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005076 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 errors, &errorHandler,
5078 "unicodeescape", message,
5079 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005080 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005081 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005082 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005083 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005084 }
5085 chr = (chr<<4) & ~0xF;
5086 if (c >= '0' && c <= '9')
5087 chr += c - '0';
5088 else if (c >= 'a' && c <= 'f')
5089 chr += 10 + c - 'a';
5090 else
5091 chr += 10 + c - 'A';
5092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005093 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005094 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005095 /* _decoding_error will have already written into the
5096 target buffer. */
5097 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005098 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005099 /* when we get here, chr is a 32-bit unicode character */
5100 if (chr <= 0xffff)
5101 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005102 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005103 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005104 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005105 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005106#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005107 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005108#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005109 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005110 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5111 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005112#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005113 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005115 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005116 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005117 errors, &errorHandler,
5118 "unicodeescape", "illegal Unicode character",
5119 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005120 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005121 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005122 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005123 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005124 break;
5125
Benjamin Peterson29060642009-01-31 22:14:21 +00005126 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005127 case 'N':
5128 message = "malformed \\N character escape";
5129 if (ucnhash_CAPI == NULL) {
5130 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005131 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5132 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005133 if (ucnhash_CAPI == NULL)
5134 goto ucnhashError;
5135 }
5136 if (*s == '{') {
5137 const char *start = s+1;
5138 /* look for the closing brace */
5139 while (*s != '}' && s < end)
5140 s++;
5141 if (s > start && s < end && *s == '}') {
5142 /* found a name. look it up in the unicode database */
5143 message = "unknown Unicode character name";
5144 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005145 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5146 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005147 goto store;
5148 }
5149 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005150 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005151 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005152 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005153 errors, &errorHandler,
5154 "unicodeescape", message,
5155 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005156 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005157 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005158 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005159 break;
5160
5161 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005162 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005163 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005164 message = "\\ at end of string";
5165 s--;
5166 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005167 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005168 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005169 errors, &errorHandler,
5170 "unicodeescape", message,
5171 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005172 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005173 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005174 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005175 }
5176 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005177 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5178 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005179 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005180 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005183 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005185 /* Ensure the length prediction worked in case of ASCII strings */
5186 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5187
5188 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5189 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005190 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005191 Py_XDECREF(errorHandler);
5192 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005194
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005196 PyErr_SetString(
5197 PyExc_UnicodeError,
5198 "\\N escapes not supported (can't load unicodedata module)"
5199 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005200 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005201 Py_XDECREF(errorHandler);
5202 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005203 return NULL;
5204
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005207 Py_XDECREF(errorHandler);
5208 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 return NULL;
5210}
5211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005212#undef WRITE_ASCII_OR_WSTR
5213#undef WRITE_WSTR
5214
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215/* Return a Unicode-Escape string version of the Unicode object.
5216
5217 If quotes is true, the string is enclosed in u"" or u'' quotes as
5218 appropriate.
5219
5220*/
5221
Walter Dörwald79e913e2007-05-12 11:08:06 +00005222static const char *hexdigits = "0123456789abcdef";
5223
Alexander Belopolsky40018472011-02-26 01:02:56 +00005224PyObject *
5225PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005226 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005228 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005231#ifdef Py_UNICODE_WIDE
5232 const Py_ssize_t expandsize = 10;
5233#else
5234 const Py_ssize_t expandsize = 6;
5235#endif
5236
Thomas Wouters89f507f2006-12-13 04:49:30 +00005237 /* XXX(nnorwitz): rather than over-allocating, it would be
5238 better to choose a different scheme. Perhaps scan the
5239 first N-chars of the string and allocate based on that size.
5240 */
5241 /* Initial allocation is based on the longest-possible unichr
5242 escape.
5243
5244 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5245 unichr, so in this case it's the longest unichr escape. In
5246 narrow (UTF-16) builds this is five chars per source unichr
5247 since there are two unichrs in the surrogate pair, so in narrow
5248 (UTF-16) builds it's not the longest unichr escape.
5249
5250 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5251 so in the narrow (UTF-16) build case it's the longest unichr
5252 escape.
5253 */
5254
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005255 if (size == 0)
5256 return PyBytes_FromStringAndSize(NULL, 0);
5257
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005258 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005259 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005260
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005261 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005262 2
5263 + expandsize*size
5264 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 if (repr == NULL)
5266 return NULL;
5267
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005268 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 while (size-- > 0) {
5271 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005272
Walter Dörwald79e913e2007-05-12 11:08:06 +00005273 /* Escape backslashes */
5274 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 *p++ = '\\';
5276 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005277 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005278 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005279
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005280#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005281 /* Map 21-bit characters to '\U00xxxxxx' */
5282 else if (ch >= 0x10000) {
5283 *p++ = '\\';
5284 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005285 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5286 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5287 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5288 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5289 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5290 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5291 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5292 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005294 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005295#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5297 else if (ch >= 0xD800 && ch < 0xDC00) {
5298 Py_UNICODE ch2;
5299 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005300
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 ch2 = *s++;
5302 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005303 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005304 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5305 *p++ = '\\';
5306 *p++ = 'U';
5307 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5308 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5309 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5310 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5311 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5312 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5313 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5314 *p++ = hexdigits[ucs & 0x0000000F];
5315 continue;
5316 }
5317 /* Fall through: isolated surrogates are copied as-is */
5318 s--;
5319 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005320 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005321#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005322
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005324 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 *p++ = '\\';
5326 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005327 *p++ = hexdigits[(ch >> 12) & 0x000F];
5328 *p++ = hexdigits[(ch >> 8) & 0x000F];
5329 *p++ = hexdigits[(ch >> 4) & 0x000F];
5330 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005332
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005333 /* Map special whitespace to '\t', \n', '\r' */
5334 else if (ch == '\t') {
5335 *p++ = '\\';
5336 *p++ = 't';
5337 }
5338 else if (ch == '\n') {
5339 *p++ = '\\';
5340 *p++ = 'n';
5341 }
5342 else if (ch == '\r') {
5343 *p++ = '\\';
5344 *p++ = 'r';
5345 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005346
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005347 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005348 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005350 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005351 *p++ = hexdigits[(ch >> 4) & 0x000F];
5352 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005353 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005354
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 /* Copy everything else as-is */
5356 else
5357 *p++ = (char) ch;
5358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005360 assert(p - PyBytes_AS_STRING(repr) > 0);
5361 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5362 return NULL;
5363 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364}
5365
Alexander Belopolsky40018472011-02-26 01:02:56 +00005366PyObject *
5367PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005369 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370 if (!PyUnicode_Check(unicode)) {
5371 PyErr_BadArgument();
5372 return NULL;
5373 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005374 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5375 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005376 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377}
5378
5379/* --- Raw Unicode Escape Codec ------------------------------------------- */
5380
Alexander Belopolsky40018472011-02-26 01:02:56 +00005381PyObject *
5382PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005383 Py_ssize_t size,
5384 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005386 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005387 Py_ssize_t startinpos;
5388 Py_ssize_t endinpos;
5389 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005391 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 const char *end;
5393 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005394 PyObject *errorHandler = NULL;
5395 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005396
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 /* Escaped strings will always be longer than the resulting
5398 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005399 length after conversion to the true value. (But decoding error
5400 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 v = _PyUnicode_New(size);
5402 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005406 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 end = s + size;
5408 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 unsigned char c;
5410 Py_UCS4 x;
5411 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005412 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 /* Non-escape characters are interpreted as Unicode ordinals */
5415 if (*s != '\\') {
5416 *p++ = (unsigned char)*s++;
5417 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005418 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 startinpos = s-starts;
5420
5421 /* \u-escapes are only interpreted iff the number of leading
5422 backslashes if odd */
5423 bs = s;
5424 for (;s < end;) {
5425 if (*s != '\\')
5426 break;
5427 *p++ = (unsigned char)*s++;
5428 }
5429 if (((s - bs) & 1) == 0 ||
5430 s >= end ||
5431 (*s != 'u' && *s != 'U')) {
5432 continue;
5433 }
5434 p--;
5435 count = *s=='u' ? 4 : 8;
5436 s++;
5437
5438 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5439 outpos = p-PyUnicode_AS_UNICODE(v);
5440 for (x = 0, i = 0; i < count; ++i, ++s) {
5441 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005442 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005443 endinpos = s-starts;
5444 if (unicode_decode_call_errorhandler(
5445 errors, &errorHandler,
5446 "rawunicodeescape", "truncated \\uXXXX",
5447 &starts, &end, &startinpos, &endinpos, &exc, &s,
5448 &v, &outpos, &p))
5449 goto onError;
5450 goto nextByte;
5451 }
5452 x = (x<<4) & ~0xF;
5453 if (c >= '0' && c <= '9')
5454 x += c - '0';
5455 else if (c >= 'a' && c <= 'f')
5456 x += 10 + c - 'a';
5457 else
5458 x += 10 + c - 'A';
5459 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005460 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005461 /* UCS-2 character */
5462 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005463 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 /* UCS-4 character. Either store directly, or as
5465 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005466#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005468#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005469 x -= 0x10000L;
5470 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5471 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005472#endif
5473 } else {
5474 endinpos = s-starts;
5475 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005476 if (unicode_decode_call_errorhandler(
5477 errors, &errorHandler,
5478 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 &starts, &end, &startinpos, &endinpos, &exc, &s,
5480 &v, &outpos, &p))
5481 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005482 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 nextByte:
5484 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005486 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005487 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 Py_XDECREF(errorHandler);
5489 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005490 if (PyUnicode_READY(v) == -1) {
5491 Py_DECREF(v);
5492 return NULL;
5493 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005495
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005498 Py_XDECREF(errorHandler);
5499 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 return NULL;
5501}
5502
Alexander Belopolsky40018472011-02-26 01:02:56 +00005503PyObject *
5504PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005505 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005507 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508 char *p;
5509 char *q;
5510
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005511#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005512 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005513#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005514 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005515#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005516
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005517 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005519
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005520 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 if (repr == NULL)
5522 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005523 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005524 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005526 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 while (size-- > 0) {
5528 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005529#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005530 /* Map 32-bit characters to '\Uxxxxxxxx' */
5531 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005532 *p++ = '\\';
5533 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005534 *p++ = hexdigits[(ch >> 28) & 0xf];
5535 *p++ = hexdigits[(ch >> 24) & 0xf];
5536 *p++ = hexdigits[(ch >> 20) & 0xf];
5537 *p++ = hexdigits[(ch >> 16) & 0xf];
5538 *p++ = hexdigits[(ch >> 12) & 0xf];
5539 *p++ = hexdigits[(ch >> 8) & 0xf];
5540 *p++ = hexdigits[(ch >> 4) & 0xf];
5541 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005542 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005543 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005544#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005545 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5546 if (ch >= 0xD800 && ch < 0xDC00) {
5547 Py_UNICODE ch2;
5548 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005549
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 ch2 = *s++;
5551 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005552 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5554 *p++ = '\\';
5555 *p++ = 'U';
5556 *p++ = hexdigits[(ucs >> 28) & 0xf];
5557 *p++ = hexdigits[(ucs >> 24) & 0xf];
5558 *p++ = hexdigits[(ucs >> 20) & 0xf];
5559 *p++ = hexdigits[(ucs >> 16) & 0xf];
5560 *p++ = hexdigits[(ucs >> 12) & 0xf];
5561 *p++ = hexdigits[(ucs >> 8) & 0xf];
5562 *p++ = hexdigits[(ucs >> 4) & 0xf];
5563 *p++ = hexdigits[ucs & 0xf];
5564 continue;
5565 }
5566 /* Fall through: isolated surrogates are copied as-is */
5567 s--;
5568 size++;
5569 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005570#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005571 /* Map 16-bit characters to '\uxxxx' */
5572 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 *p++ = '\\';
5574 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005575 *p++ = hexdigits[(ch >> 12) & 0xf];
5576 *p++ = hexdigits[(ch >> 8) & 0xf];
5577 *p++ = hexdigits[(ch >> 4) & 0xf];
5578 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 /* Copy everything else as-is */
5581 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 *p++ = (char) ch;
5583 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005584 size = p - q;
5585
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005586 assert(size > 0);
5587 if (_PyBytes_Resize(&repr, size) < 0)
5588 return NULL;
5589 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590}
5591
Alexander Belopolsky40018472011-02-26 01:02:56 +00005592PyObject *
5593PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005595 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005597 PyErr_BadArgument();
5598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005600 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5601 PyUnicode_GET_SIZE(unicode));
5602
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005603 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604}
5605
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005606/* --- Unicode Internal Codec ------------------------------------------- */
5607
Alexander Belopolsky40018472011-02-26 01:02:56 +00005608PyObject *
5609_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005610 Py_ssize_t size,
5611 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005612{
5613 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005614 Py_ssize_t startinpos;
5615 Py_ssize_t endinpos;
5616 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005617 PyUnicodeObject *v;
5618 Py_UNICODE *p;
5619 const char *end;
5620 const char *reason;
5621 PyObject *errorHandler = NULL;
5622 PyObject *exc = NULL;
5623
Neal Norwitzd43069c2006-01-08 01:12:10 +00005624#ifdef Py_UNICODE_WIDE
5625 Py_UNICODE unimax = PyUnicode_GetMax();
5626#endif
5627
Thomas Wouters89f507f2006-12-13 04:49:30 +00005628 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005629 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5630 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005632 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5633 as string was created with the old API. */
5634 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005636 p = PyUnicode_AS_UNICODE(v);
5637 end = s + size;
5638
5639 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005640 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005641 /* We have to sanity check the raw data, otherwise doom looms for
5642 some malformed UCS-4 data. */
5643 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005644#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005645 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005646#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005647 end-s < Py_UNICODE_SIZE
5648 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005650 startinpos = s - starts;
5651 if (end-s < Py_UNICODE_SIZE) {
5652 endinpos = end-starts;
5653 reason = "truncated input";
5654 }
5655 else {
5656 endinpos = s - starts + Py_UNICODE_SIZE;
5657 reason = "illegal code point (> 0x10FFFF)";
5658 }
5659 outpos = p - PyUnicode_AS_UNICODE(v);
5660 if (unicode_decode_call_errorhandler(
5661 errors, &errorHandler,
5662 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005663 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005664 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005665 goto onError;
5666 }
5667 }
5668 else {
5669 p++;
5670 s += Py_UNICODE_SIZE;
5671 }
5672 }
5673
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005674 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005675 goto onError;
5676 Py_XDECREF(errorHandler);
5677 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005678 if (PyUnicode_READY(v) == -1) {
5679 Py_DECREF(v);
5680 return NULL;
5681 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005682 return (PyObject *)v;
5683
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005685 Py_XDECREF(v);
5686 Py_XDECREF(errorHandler);
5687 Py_XDECREF(exc);
5688 return NULL;
5689}
5690
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691/* --- Latin-1 Codec ------------------------------------------------------ */
5692
Alexander Belopolsky40018472011-02-26 01:02:56 +00005693PyObject *
5694PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005695 Py_ssize_t size,
5696 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005699 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700}
5701
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005703static void
5704make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005705 const char *encoding,
5706 const Py_UNICODE *unicode, Py_ssize_t size,
5707 Py_ssize_t startpos, Py_ssize_t endpos,
5708 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 *exceptionObject = PyUnicodeEncodeError_Create(
5712 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 }
5714 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5716 goto onError;
5717 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5718 goto onError;
5719 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5720 goto onError;
5721 return;
5722 onError:
5723 Py_DECREF(*exceptionObject);
5724 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725 }
5726}
5727
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005729static void
5730raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005731 const char *encoding,
5732 const Py_UNICODE *unicode, Py_ssize_t size,
5733 Py_ssize_t startpos, Py_ssize_t endpos,
5734 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735{
5736 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005738 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740}
5741
5742/* error handling callback helper:
5743 build arguments, call the callback and check the arguments,
5744 put the result into newpos and return the replacement string, which
5745 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005746static PyObject *
5747unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005748 PyObject **errorHandler,
5749 const char *encoding, const char *reason,
5750 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5751 Py_ssize_t startpos, Py_ssize_t endpos,
5752 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005754 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755
5756 PyObject *restuple;
5757 PyObject *resunicode;
5758
5759 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005763 }
5764
5765 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769
5770 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005772 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005775 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 Py_DECREF(restuple);
5777 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005778 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005779 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 &resunicode, newpos)) {
5781 Py_DECREF(restuple);
5782 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005783 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005784 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5785 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5786 Py_DECREF(restuple);
5787 return NULL;
5788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005789 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005791 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5793 Py_DECREF(restuple);
5794 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005795 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 Py_INCREF(resunicode);
5797 Py_DECREF(restuple);
5798 return resunicode;
5799}
5800
Alexander Belopolsky40018472011-02-26 01:02:56 +00005801static PyObject *
5802unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005803 Py_ssize_t size,
5804 const char *errors,
5805 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806{
5807 /* output object */
5808 PyObject *res;
5809 /* pointers to the beginning and end+1 of input */
5810 const Py_UNICODE *startp = p;
5811 const Py_UNICODE *endp = p + size;
5812 /* pointer to the beginning of the unencodable characters */
5813 /* const Py_UNICODE *badp = NULL; */
5814 /* pointer into the output */
5815 char *str;
5816 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005817 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005818 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5819 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005820 PyObject *errorHandler = NULL;
5821 PyObject *exc = NULL;
5822 /* the following variable is used for caching string comparisons
5823 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5824 int known_errorHandler = -1;
5825
5826 /* allocate enough for a simple encoding without
5827 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005828 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005829 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005830 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005831 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005832 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005833 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834 ressize = size;
5835
5836 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005838
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 /* can we encode this? */
5840 if (c<limit) {
5841 /* no overflow check, because we know that the space is enough */
5842 *str++ = (char)c;
5843 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005844 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 else {
5846 Py_ssize_t unicodepos = p-startp;
5847 Py_ssize_t requiredsize;
5848 PyObject *repunicode;
5849 Py_ssize_t repsize;
5850 Py_ssize_t newpos;
5851 Py_ssize_t respos;
5852 Py_UNICODE *uni2;
5853 /* startpos for collecting unencodable chars */
5854 const Py_UNICODE *collstart = p;
5855 const Py_UNICODE *collend = p;
5856 /* find all unecodable characters */
5857 while ((collend < endp) && ((*collend)>=limit))
5858 ++collend;
5859 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5860 if (known_errorHandler==-1) {
5861 if ((errors==NULL) || (!strcmp(errors, "strict")))
5862 known_errorHandler = 1;
5863 else if (!strcmp(errors, "replace"))
5864 known_errorHandler = 2;
5865 else if (!strcmp(errors, "ignore"))
5866 known_errorHandler = 3;
5867 else if (!strcmp(errors, "xmlcharrefreplace"))
5868 known_errorHandler = 4;
5869 else
5870 known_errorHandler = 0;
5871 }
5872 switch (known_errorHandler) {
5873 case 1: /* strict */
5874 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5875 goto onError;
5876 case 2: /* replace */
5877 while (collstart++<collend)
5878 *str++ = '?'; /* fall through */
5879 case 3: /* ignore */
5880 p = collend;
5881 break;
5882 case 4: /* xmlcharrefreplace */
5883 respos = str - PyBytes_AS_STRING(res);
5884 /* determine replacement size (temporarily (mis)uses p) */
5885 for (p = collstart, repsize = 0; p < collend; ++p) {
5886 if (*p<10)
5887 repsize += 2+1+1;
5888 else if (*p<100)
5889 repsize += 2+2+1;
5890 else if (*p<1000)
5891 repsize += 2+3+1;
5892 else if (*p<10000)
5893 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005894#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 else
5896 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005897#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005898 else if (*p<100000)
5899 repsize += 2+5+1;
5900 else if (*p<1000000)
5901 repsize += 2+6+1;
5902 else
5903 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005904#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 }
5906 requiredsize = respos+repsize+(endp-collend);
5907 if (requiredsize > ressize) {
5908 if (requiredsize<2*ressize)
5909 requiredsize = 2*ressize;
5910 if (_PyBytes_Resize(&res, requiredsize))
5911 goto onError;
5912 str = PyBytes_AS_STRING(res) + respos;
5913 ressize = requiredsize;
5914 }
5915 /* generate replacement (temporarily (mis)uses p) */
5916 for (p = collstart; p < collend; ++p) {
5917 str += sprintf(str, "&#%d;", (int)*p);
5918 }
5919 p = collend;
5920 break;
5921 default:
5922 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5923 encoding, reason, startp, size, &exc,
5924 collstart-startp, collend-startp, &newpos);
5925 if (repunicode == NULL)
5926 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005927 if (PyBytes_Check(repunicode)) {
5928 /* Directly copy bytes result to output. */
5929 repsize = PyBytes_Size(repunicode);
5930 if (repsize > 1) {
5931 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005932 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005933 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5934 Py_DECREF(repunicode);
5935 goto onError;
5936 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005937 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005938 ressize += repsize-1;
5939 }
5940 memcpy(str, PyBytes_AsString(repunicode), repsize);
5941 str += repsize;
5942 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005943 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005944 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005945 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 /* need more space? (at least enough for what we
5947 have+the replacement+the rest of the string, so
5948 we won't have to check space for encodable characters) */
5949 respos = str - PyBytes_AS_STRING(res);
5950 repsize = PyUnicode_GET_SIZE(repunicode);
5951 requiredsize = respos+repsize+(endp-collend);
5952 if (requiredsize > ressize) {
5953 if (requiredsize<2*ressize)
5954 requiredsize = 2*ressize;
5955 if (_PyBytes_Resize(&res, requiredsize)) {
5956 Py_DECREF(repunicode);
5957 goto onError;
5958 }
5959 str = PyBytes_AS_STRING(res) + respos;
5960 ressize = requiredsize;
5961 }
5962 /* check if there is anything unencodable in the replacement
5963 and copy it to the output */
5964 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5965 c = *uni2;
5966 if (c >= limit) {
5967 raise_encode_exception(&exc, encoding, startp, size,
5968 unicodepos, unicodepos+1, reason);
5969 Py_DECREF(repunicode);
5970 goto onError;
5971 }
5972 *str = (char)c;
5973 }
5974 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005975 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005976 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005977 }
5978 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005979 /* Resize if we allocated to much */
5980 size = str - PyBytes_AS_STRING(res);
5981 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005982 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005983 if (_PyBytes_Resize(&res, size) < 0)
5984 goto onError;
5985 }
5986
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987 Py_XDECREF(errorHandler);
5988 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005989 return res;
5990
5991 onError:
5992 Py_XDECREF(res);
5993 Py_XDECREF(errorHandler);
5994 Py_XDECREF(exc);
5995 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996}
5997
Alexander Belopolsky40018472011-02-26 01:02:56 +00005998PyObject *
5999PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006000 Py_ssize_t size,
6001 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006003 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004}
6005
Alexander Belopolsky40018472011-02-26 01:02:56 +00006006PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006007_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008{
6009 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 PyErr_BadArgument();
6011 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006013 if (PyUnicode_READY(unicode) == -1)
6014 return NULL;
6015 /* Fast path: if it is a one-byte string, construct
6016 bytes object directly. */
6017 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6018 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6019 PyUnicode_GET_LENGTH(unicode));
6020 /* Non-Latin-1 characters present. Defer to above function to
6021 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006024 errors);
6025}
6026
6027PyObject*
6028PyUnicode_AsLatin1String(PyObject *unicode)
6029{
6030 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031}
6032
6033/* --- 7-bit ASCII Codec -------------------------------------------------- */
6034
Alexander Belopolsky40018472011-02-26 01:02:56 +00006035PyObject *
6036PyUnicode_DecodeASCII(const char *s,
6037 Py_ssize_t size,
6038 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006040 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 PyUnicodeObject *v;
6042 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006043 Py_ssize_t startinpos;
6044 Py_ssize_t endinpos;
6045 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006046 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006047 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006048 PyObject *errorHandler = NULL;
6049 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006050 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006051
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006053 if (size == 1 && *(unsigned char*)s < 128)
6054 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6055
6056 /* Fast path. Assume the input actually *is* ASCII, and allocate
6057 a single-block Unicode object with that assumption. If there is
6058 an error, drop the object and start over. */
6059 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6060 if (v == NULL)
6061 goto onError;
6062 d = PyUnicode_1BYTE_DATA(v);
6063 for (i = 0; i < size; i++) {
6064 unsigned char ch = ((unsigned char*)s)[i];
6065 if (ch < 128)
6066 d[i] = ch;
6067 else
6068 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006070 if (i == size)
6071 return (PyObject*)v;
6072 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006073
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 v = _PyUnicode_New(size);
6075 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080 e = s + size;
6081 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 register unsigned char c = (unsigned char)*s;
6083 if (c < 128) {
6084 *p++ = c;
6085 ++s;
6086 }
6087 else {
6088 startinpos = s-starts;
6089 endinpos = startinpos + 1;
6090 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6091 if (unicode_decode_call_errorhandler(
6092 errors, &errorHandler,
6093 "ascii", "ordinal not in range(128)",
6094 &starts, &e, &startinpos, &endinpos, &exc, &s,
6095 &v, &outpos, &p))
6096 goto onError;
6097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006099 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6101 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006102 Py_XDECREF(errorHandler);
6103 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006104 if (PyUnicode_READY(v) == -1) {
6105 Py_DECREF(v);
6106 return NULL;
6107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006109
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 Py_XDECREF(errorHandler);
6113 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 return NULL;
6115}
6116
Alexander Belopolsky40018472011-02-26 01:02:56 +00006117PyObject *
6118PyUnicode_EncodeASCII(const Py_UNICODE *p,
6119 Py_ssize_t size,
6120 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123}
6124
Alexander Belopolsky40018472011-02-26 01:02:56 +00006125PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006126_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127{
6128 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 PyErr_BadArgument();
6130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006132 if (PyUnicode_READY(unicode) == -1)
6133 return NULL;
6134 /* Fast path: if it is an ASCII-only string, construct bytes object
6135 directly. Else defer to above function to raise the exception. */
6136 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6137 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6138 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006141 errors);
6142}
6143
6144PyObject *
6145PyUnicode_AsASCIIString(PyObject *unicode)
6146{
6147 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148}
6149
Victor Stinner99b95382011-07-04 14:23:54 +02006150#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006151
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006152/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006153
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006154#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006155#define NEED_RETRY
6156#endif
6157
6158/* XXX This code is limited to "true" double-byte encodings, as
6159 a) it assumes an incomplete character consists of a single byte, and
6160 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006162
/* Return non-zero if the byte at s[offset] is a DBCS lead byte that
   starts a character (as opposed to being the trail byte of the
   preceding character), 0 otherwise.  Uses IsDBCSLeadByte() and
   CharPrev() to look at the previous character boundary. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
6174
6175/*
6176 * Decode MBCS string into unicode object. If 'final' is set, converts
6177 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6178 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006179static int
6180decode_mbcs(PyUnicodeObject **v,
6181 const char *s, /* MBCS string */
6182 int size, /* sizeof MBCS string */
6183 int final,
6184 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006185{
6186 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006187 Py_ssize_t n;
6188 DWORD usize;
6189 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006190
6191 assert(size >= 0);
6192
Victor Stinner554f3f02010-06-16 23:33:54 +00006193 /* check and handle 'errors' arg */
6194 if (errors==NULL || strcmp(errors, "strict")==0)
6195 flags = MB_ERR_INVALID_CHARS;
6196 else if (strcmp(errors, "ignore")==0)
6197 flags = 0;
6198 else {
6199 PyErr_Format(PyExc_ValueError,
6200 "mbcs encoding does not support errors='%s'",
6201 errors);
6202 return -1;
6203 }
6204
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006205 /* Skip trailing lead-byte unless 'final' is set */
6206 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006208
6209 /* First get the size of the result */
6210 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006211 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6212 if (usize==0)
6213 goto mbcs_decode_error;
6214 } else
6215 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006216
6217 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 /* Create unicode object */
6219 *v = _PyUnicode_New(usize);
6220 if (*v == NULL)
6221 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006222 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006223 }
6224 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006225 /* Extend unicode object */
6226 n = PyUnicode_GET_SIZE(*v);
6227 if (_PyUnicode_Resize(v, n + usize) < 0)
6228 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006229 }
6230
6231 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006232 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006234 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6235 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006237 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006238 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006239
6240mbcs_decode_error:
6241 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6242 we raise a UnicodeDecodeError - else it is a 'generic'
6243 windows error
6244 */
6245 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6246 /* Ideally, we should get reason from FormatMessage - this
6247 is the Windows 2000 English version of the message
6248 */
6249 PyObject *exc = NULL;
6250 const char *reason = "No mapping for the Unicode character exists "
6251 "in the target multi-byte code page.";
6252 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6253 if (exc != NULL) {
6254 PyCodec_StrictErrors(exc);
6255 Py_DECREF(exc);
6256 }
6257 } else {
6258 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6259 }
6260 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006261}
6262
Alexander Belopolsky40018472011-02-26 01:02:56 +00006263PyObject *
6264PyUnicode_DecodeMBCSStateful(const char *s,
6265 Py_ssize_t size,
6266 const char *errors,
6267 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006268{
6269 PyUnicodeObject *v = NULL;
6270 int done;
6271
6272 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006274
6275#ifdef NEED_RETRY
6276 retry:
6277 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006278 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006279 else
6280#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006281 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006282
6283 if (done < 0) {
6284 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006286 }
6287
6288 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006290
6291#ifdef NEED_RETRY
6292 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 s += done;
6294 size -= done;
6295 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006296 }
6297#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006298 if (PyUnicode_READY(v) == -1) {
6299 Py_DECREF(v);
6300 return NULL;
6301 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006302 return (PyObject *)v;
6303}
6304
Alexander Belopolsky40018472011-02-26 01:02:56 +00006305PyObject *
6306PyUnicode_DecodeMBCS(const char *s,
6307 Py_ssize_t size,
6308 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006309{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006310 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6311}
6312
6313/*
6314 * Convert unicode into string object (MBCS).
6315 * Returns 0 if succeed, -1 otherwise.
6316 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006317static int
6318encode_mbcs(PyObject **repr,
6319 const Py_UNICODE *p, /* unicode */
6320 int size, /* size of unicode */
6321 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006322{
Victor Stinner554f3f02010-06-16 23:33:54 +00006323 BOOL usedDefaultChar = FALSE;
6324 BOOL *pusedDefaultChar;
6325 int mbcssize;
6326 Py_ssize_t n;
6327 PyObject *exc = NULL;
6328 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006329
6330 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006331
Victor Stinner554f3f02010-06-16 23:33:54 +00006332 /* check and handle 'errors' arg */
6333 if (errors==NULL || strcmp(errors, "strict")==0) {
6334 flags = WC_NO_BEST_FIT_CHARS;
6335 pusedDefaultChar = &usedDefaultChar;
6336 } else if (strcmp(errors, "replace")==0) {
6337 flags = 0;
6338 pusedDefaultChar = NULL;
6339 } else {
6340 PyErr_Format(PyExc_ValueError,
6341 "mbcs encoding does not support errors='%s'",
6342 errors);
6343 return -1;
6344 }
6345
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006346 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006347 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006348 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6349 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 if (mbcssize == 0) {
6351 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6352 return -1;
6353 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006354 /* If we used a default char, then we failed! */
6355 if (pusedDefaultChar && *pusedDefaultChar)
6356 goto mbcs_encode_error;
6357 } else {
6358 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006359 }
6360
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006361 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006362 /* Create string object */
6363 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6364 if (*repr == NULL)
6365 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006366 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006367 }
6368 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 /* Extend string object */
6370 n = PyBytes_Size(*repr);
6371 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6372 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006373 }
6374
6375 /* Do the conversion */
6376 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006378 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6379 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6381 return -1;
6382 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006383 if (pusedDefaultChar && *pusedDefaultChar)
6384 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006385 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006386 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006387
6388mbcs_encode_error:
6389 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6390 Py_XDECREF(exc);
6391 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006392}
6393
Alexander Belopolsky40018472011-02-26 01:02:56 +00006394PyObject *
6395PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6396 Py_ssize_t size,
6397 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006398{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006399 PyObject *repr = NULL;
6400 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006401
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006402#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006404 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006405 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006406 else
6407#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006408 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006409
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006410 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 Py_XDECREF(repr);
6412 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006413 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006414
6415#ifdef NEED_RETRY
6416 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 p += INT_MAX;
6418 size -= INT_MAX;
6419 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006420 }
6421#endif
6422
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006423 return repr;
6424}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006425
Alexander Belopolsky40018472011-02-26 01:02:56 +00006426PyObject *
6427PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006428{
6429 if (!PyUnicode_Check(unicode)) {
6430 PyErr_BadArgument();
6431 return NULL;
6432 }
6433 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 PyUnicode_GET_SIZE(unicode),
6435 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006436}
6437
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006438#undef NEED_RETRY
6439
Victor Stinner99b95382011-07-04 14:23:54 +02006440#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006441
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442/* --- Character Mapping Codec -------------------------------------------- */
6443
Alexander Belopolsky40018472011-02-26 01:02:56 +00006444PyObject *
6445PyUnicode_DecodeCharmap(const char *s,
6446 Py_ssize_t size,
6447 PyObject *mapping,
6448 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006451 Py_ssize_t startinpos;
6452 Py_ssize_t endinpos;
6453 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006454 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 PyUnicodeObject *v;
6456 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006457 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006458 PyObject *errorHandler = NULL;
6459 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006460 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006461 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006462
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 /* Default to Latin-1 */
6464 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466
6467 v = _PyUnicode_New(size);
6468 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006473 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006474 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 mapstring = PyUnicode_AS_UNICODE(mapping);
6476 maplen = PyUnicode_GET_SIZE(mapping);
6477 while (s < e) {
6478 unsigned char ch = *s;
6479 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 if (ch < maplen)
6482 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 if (x == 0xfffe) {
6485 /* undefined mapping */
6486 outpos = p-PyUnicode_AS_UNICODE(v);
6487 startinpos = s-starts;
6488 endinpos = startinpos+1;
6489 if (unicode_decode_call_errorhandler(
6490 errors, &errorHandler,
6491 "charmap", "character maps to <undefined>",
6492 &starts, &e, &startinpos, &endinpos, &exc, &s,
6493 &v, &outpos, &p)) {
6494 goto onError;
6495 }
6496 continue;
6497 }
6498 *p++ = x;
6499 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006500 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006501 }
6502 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 while (s < e) {
6504 unsigned char ch = *s;
6505 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006506
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6508 w = PyLong_FromLong((long)ch);
6509 if (w == NULL)
6510 goto onError;
6511 x = PyObject_GetItem(mapping, w);
6512 Py_DECREF(w);
6513 if (x == NULL) {
6514 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6515 /* No mapping found means: mapping is undefined. */
6516 PyErr_Clear();
6517 x = Py_None;
6518 Py_INCREF(x);
6519 } else
6520 goto onError;
6521 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006522
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 /* Apply mapping */
6524 if (PyLong_Check(x)) {
6525 long value = PyLong_AS_LONG(x);
6526 if (value < 0 || value > 65535) {
6527 PyErr_SetString(PyExc_TypeError,
6528 "character mapping must be in range(65536)");
6529 Py_DECREF(x);
6530 goto onError;
6531 }
6532 *p++ = (Py_UNICODE)value;
6533 }
6534 else if (x == Py_None) {
6535 /* undefined mapping */
6536 outpos = p-PyUnicode_AS_UNICODE(v);
6537 startinpos = s-starts;
6538 endinpos = startinpos+1;
6539 if (unicode_decode_call_errorhandler(
6540 errors, &errorHandler,
6541 "charmap", "character maps to <undefined>",
6542 &starts, &e, &startinpos, &endinpos, &exc, &s,
6543 &v, &outpos, &p)) {
6544 Py_DECREF(x);
6545 goto onError;
6546 }
6547 Py_DECREF(x);
6548 continue;
6549 }
6550 else if (PyUnicode_Check(x)) {
6551 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006552
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 if (targetsize == 1)
6554 /* 1-1 mapping */
6555 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006556
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 else if (targetsize > 1) {
6558 /* 1-n mapping */
6559 if (targetsize > extrachars) {
6560 /* resize first */
6561 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6562 Py_ssize_t needed = (targetsize - extrachars) + \
6563 (targetsize << 2);
6564 extrachars += needed;
6565 /* XXX overflow detection missing */
6566 if (_PyUnicode_Resize(&v,
6567 PyUnicode_GET_SIZE(v) + needed) < 0) {
6568 Py_DECREF(x);
6569 goto onError;
6570 }
6571 p = PyUnicode_AS_UNICODE(v) + oldpos;
6572 }
6573 Py_UNICODE_COPY(p,
6574 PyUnicode_AS_UNICODE(x),
6575 targetsize);
6576 p += targetsize;
6577 extrachars -= targetsize;
6578 }
6579 /* 1-0 mapping: skip the character */
6580 }
6581 else {
6582 /* wrong return value */
6583 PyErr_SetString(PyExc_TypeError,
6584 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006585 Py_DECREF(x);
6586 goto onError;
6587 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 Py_DECREF(x);
6589 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 }
6592 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6594 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006595 Py_XDECREF(errorHandler);
6596 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006597 if (PyUnicode_READY(v) == -1) {
6598 Py_DECREF(v);
6599 return NULL;
6600 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006602
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006604 Py_XDECREF(errorHandler);
6605 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 Py_XDECREF(v);
6607 return NULL;
6608}
6609
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006610/* Charmap encoding: the lookup table */
6611
Alexander Belopolsky40018472011-02-26 01:02:56 +00006612struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 PyObject_HEAD
6614 unsigned char level1[32];
6615 int count2, count3;
6616 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006617};
6618
6619static PyObject*
6620encoding_map_size(PyObject *obj, PyObject* args)
6621{
6622 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006623 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006625}
6626
6627static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006628 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 PyDoc_STR("Return the size (in bytes) of this object") },
6630 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006631};
6632
6633static void
6634encoding_map_dealloc(PyObject* o)
6635{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006636 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006637}
6638
6639static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006640 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 "EncodingMap", /*tp_name*/
6642 sizeof(struct encoding_map), /*tp_basicsize*/
6643 0, /*tp_itemsize*/
6644 /* methods */
6645 encoding_map_dealloc, /*tp_dealloc*/
6646 0, /*tp_print*/
6647 0, /*tp_getattr*/
6648 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006649 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 0, /*tp_repr*/
6651 0, /*tp_as_number*/
6652 0, /*tp_as_sequence*/
6653 0, /*tp_as_mapping*/
6654 0, /*tp_hash*/
6655 0, /*tp_call*/
6656 0, /*tp_str*/
6657 0, /*tp_getattro*/
6658 0, /*tp_setattro*/
6659 0, /*tp_as_buffer*/
6660 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6661 0, /*tp_doc*/
6662 0, /*tp_traverse*/
6663 0, /*tp_clear*/
6664 0, /*tp_richcompare*/
6665 0, /*tp_weaklistoffset*/
6666 0, /*tp_iter*/
6667 0, /*tp_iternext*/
6668 encoding_map_methods, /*tp_methods*/
6669 0, /*tp_members*/
6670 0, /*tp_getset*/
6671 0, /*tp_base*/
6672 0, /*tp_dict*/
6673 0, /*tp_descr_get*/
6674 0, /*tp_descr_set*/
6675 0, /*tp_dictoffset*/
6676 0, /*tp_init*/
6677 0, /*tp_alloc*/
6678 0, /*tp_new*/
6679 0, /*tp_free*/
6680 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006681};
6682
6683PyObject*
6684PyUnicode_BuildEncodingMap(PyObject* string)
6685{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006686 PyObject *result;
6687 struct encoding_map *mresult;
6688 int i;
6689 int need_dict = 0;
6690 unsigned char level1[32];
6691 unsigned char level2[512];
6692 unsigned char *mlevel1, *mlevel2, *mlevel3;
6693 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006694 int kind;
6695 void *data;
6696 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006698 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006699 PyErr_BadArgument();
6700 return NULL;
6701 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006702 kind = PyUnicode_KIND(string);
6703 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006704 memset(level1, 0xFF, sizeof level1);
6705 memset(level2, 0xFF, sizeof level2);
6706
6707 /* If there isn't a one-to-one mapping of NULL to \0,
6708 or if there are non-BMP characters, we need to use
6709 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006710 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006711 need_dict = 1;
6712 for (i = 1; i < 256; i++) {
6713 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006714 ch = PyUnicode_READ(kind, data, i);
6715 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006716 need_dict = 1;
6717 break;
6718 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006719 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006720 /* unmapped character */
6721 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006722 l1 = ch >> 11;
6723 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006724 if (level1[l1] == 0xFF)
6725 level1[l1] = count2++;
6726 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006727 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006728 }
6729
6730 if (count2 >= 0xFF || count3 >= 0xFF)
6731 need_dict = 1;
6732
6733 if (need_dict) {
6734 PyObject *result = PyDict_New();
6735 PyObject *key, *value;
6736 if (!result)
6737 return NULL;
6738 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006739 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006740 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006741 if (!key || !value)
6742 goto failed1;
6743 if (PyDict_SetItem(result, key, value) == -1)
6744 goto failed1;
6745 Py_DECREF(key);
6746 Py_DECREF(value);
6747 }
6748 return result;
6749 failed1:
6750 Py_XDECREF(key);
6751 Py_XDECREF(value);
6752 Py_DECREF(result);
6753 return NULL;
6754 }
6755
6756 /* Create a three-level trie */
6757 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6758 16*count2 + 128*count3 - 1);
6759 if (!result)
6760 return PyErr_NoMemory();
6761 PyObject_Init(result, &EncodingMapType);
6762 mresult = (struct encoding_map*)result;
6763 mresult->count2 = count2;
6764 mresult->count3 = count3;
6765 mlevel1 = mresult->level1;
6766 mlevel2 = mresult->level23;
6767 mlevel3 = mresult->level23 + 16*count2;
6768 memcpy(mlevel1, level1, 32);
6769 memset(mlevel2, 0xFF, 16*count2);
6770 memset(mlevel3, 0, 128*count3);
6771 count3 = 0;
6772 for (i = 1; i < 256; i++) {
6773 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006774 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006775 /* unmapped character */
6776 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006777 o1 = PyUnicode_READ(kind, data, i)>>11;
6778 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006779 i2 = 16*mlevel1[o1] + o2;
6780 if (mlevel2[i2] == 0xFF)
6781 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006782 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006783 i3 = 128*mlevel2[i2] + o3;
6784 mlevel3[i3] = i;
6785 }
6786 return result;
6787}
6788
6789static int
6790encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6791{
6792 struct encoding_map *map = (struct encoding_map*)mapping;
6793 int l1 = c>>11;
6794 int l2 = (c>>7) & 0xF;
6795 int l3 = c & 0x7F;
6796 int i;
6797
6798#ifdef Py_UNICODE_WIDE
6799 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006801 }
6802#endif
6803 if (c == 0)
6804 return 0;
6805 /* level 1*/
6806 i = map->level1[l1];
6807 if (i == 0xFF) {
6808 return -1;
6809 }
6810 /* level 2*/
6811 i = map->level23[16*i+l2];
6812 if (i == 0xFF) {
6813 return -1;
6814 }
6815 /* level 3 */
6816 i = map->level23[16*map->count2 + 128*i + l3];
6817 if (i == 0) {
6818 return -1;
6819 }
6820 return i;
6821}
6822
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006823/* Lookup the character ch in the mapping. If the character
6824 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006825 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006826static PyObject *
6827charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828{
Christian Heimes217cfd12007-12-02 14:31:20 +00006829 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006830 PyObject *x;
6831
6832 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006834 x = PyObject_GetItem(mapping, w);
6835 Py_DECREF(w);
6836 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6838 /* No mapping found means: mapping is undefined. */
6839 PyErr_Clear();
6840 x = Py_None;
6841 Py_INCREF(x);
6842 return x;
6843 } else
6844 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006846 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006848 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 long value = PyLong_AS_LONG(x);
6850 if (value < 0 || value > 255) {
6851 PyErr_SetString(PyExc_TypeError,
6852 "character mapping must be in range(256)");
6853 Py_DECREF(x);
6854 return NULL;
6855 }
6856 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006858 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 /* wrong return value */
6862 PyErr_Format(PyExc_TypeError,
6863 "character mapping must return integer, bytes or None, not %.400s",
6864 x->ob_type->tp_name);
6865 Py_DECREF(x);
6866 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 }
6868}
6869
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006870static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006871charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006872{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006873 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6874 /* exponentially overallocate to minimize reallocations */
6875 if (requiredsize < 2*outsize)
6876 requiredsize = 2*outsize;
6877 if (_PyBytes_Resize(outobj, requiredsize))
6878 return -1;
6879 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006880}
6881
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* character was written to the output */
    enc_FAILED,     /* character has no mapping; nothing was written */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006885/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006886 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006887 space is available. Return a new reference to the object that
6888 was put in the output buffer, or Py_None, if the mapping was undefined
6889 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006890 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006891static charmapencode_result
6892charmapencode_output(Py_UNICODE c, PyObject *mapping,
6893 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006894{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006895 PyObject *rep;
6896 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006897 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898
Christian Heimes90aa7642007-12-19 02:45:37 +00006899 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006900 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006902 if (res == -1)
6903 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 if (outsize<requiredsize)
6905 if (charmapencode_resize(outobj, outpos, requiredsize))
6906 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006907 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006908 outstart[(*outpos)++] = (char)res;
6909 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006910 }
6911
6912 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006915 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006916 Py_DECREF(rep);
6917 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006918 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 if (PyLong_Check(rep)) {
6920 Py_ssize_t requiredsize = *outpos+1;
6921 if (outsize<requiredsize)
6922 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6923 Py_DECREF(rep);
6924 return enc_EXCEPTION;
6925 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006926 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006928 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 else {
6930 const char *repchars = PyBytes_AS_STRING(rep);
6931 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6932 Py_ssize_t requiredsize = *outpos+repsize;
6933 if (outsize<requiredsize)
6934 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6935 Py_DECREF(rep);
6936 return enc_EXCEPTION;
6937 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006938 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 memcpy(outstart + *outpos, repchars, repsize);
6940 *outpos += repsize;
6941 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006942 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006943 Py_DECREF(rep);
6944 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006945}
6946
6947/* handle an error in PyUnicode_EncodeCharmap
6948 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006949static int
6950charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006951 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006952 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006953 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006954 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006955{
6956 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006957 Py_ssize_t repsize;
6958 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006959 Py_UNICODE *uni2;
6960 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006961 Py_ssize_t collstartpos = *inpos;
6962 Py_ssize_t collendpos = *inpos+1;
6963 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006964 char *encoding = "charmap";
6965 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006966 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006967
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006968 /* find all unencodable characters */
6969 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006970 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006971 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 int res = encoding_map_lookup(p[collendpos], mapping);
6973 if (res != -1)
6974 break;
6975 ++collendpos;
6976 continue;
6977 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006978
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 rep = charmapencode_lookup(p[collendpos], mapping);
6980 if (rep==NULL)
6981 return -1;
6982 else if (rep!=Py_None) {
6983 Py_DECREF(rep);
6984 break;
6985 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006986 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006987 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006988 }
6989 /* cache callback name lookup
6990 * (if not done yet, i.e. it's the first error) */
6991 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006992 if ((errors==NULL) || (!strcmp(errors, "strict")))
6993 *known_errorHandler = 1;
6994 else if (!strcmp(errors, "replace"))
6995 *known_errorHandler = 2;
6996 else if (!strcmp(errors, "ignore"))
6997 *known_errorHandler = 3;
6998 else if (!strcmp(errors, "xmlcharrefreplace"))
6999 *known_errorHandler = 4;
7000 else
7001 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007002 }
7003 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007004 case 1: /* strict */
7005 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7006 return -1;
7007 case 2: /* replace */
7008 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 x = charmapencode_output('?', mapping, res, respos);
7010 if (x==enc_EXCEPTION) {
7011 return -1;
7012 }
7013 else if (x==enc_FAILED) {
7014 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7015 return -1;
7016 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007017 }
7018 /* fall through */
7019 case 3: /* ignore */
7020 *inpos = collendpos;
7021 break;
7022 case 4: /* xmlcharrefreplace */
7023 /* generate replacement (temporarily (mis)uses p) */
7024 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 char buffer[2+29+1+1];
7026 char *cp;
7027 sprintf(buffer, "&#%d;", (int)p[collpos]);
7028 for (cp = buffer; *cp; ++cp) {
7029 x = charmapencode_output(*cp, mapping, res, respos);
7030 if (x==enc_EXCEPTION)
7031 return -1;
7032 else if (x==enc_FAILED) {
7033 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7034 return -1;
7035 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007036 }
7037 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007038 *inpos = collendpos;
7039 break;
7040 default:
7041 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 encoding, reason, p, size, exceptionObject,
7043 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007044 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007046 if (PyBytes_Check(repunicode)) {
7047 /* Directly copy bytes result to output. */
7048 Py_ssize_t outsize = PyBytes_Size(*res);
7049 Py_ssize_t requiredsize;
7050 repsize = PyBytes_Size(repunicode);
7051 requiredsize = *respos + repsize;
7052 if (requiredsize > outsize)
7053 /* Make room for all additional bytes. */
7054 if (charmapencode_resize(res, respos, requiredsize)) {
7055 Py_DECREF(repunicode);
7056 return -1;
7057 }
7058 memcpy(PyBytes_AsString(*res) + *respos,
7059 PyBytes_AsString(repunicode), repsize);
7060 *respos += repsize;
7061 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007062 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007063 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007064 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007065 /* generate replacement */
7066 repsize = PyUnicode_GET_SIZE(repunicode);
7067 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 x = charmapencode_output(*uni2, mapping, res, respos);
7069 if (x==enc_EXCEPTION) {
7070 return -1;
7071 }
7072 else if (x==enc_FAILED) {
7073 Py_DECREF(repunicode);
7074 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7075 return -1;
7076 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007077 }
7078 *inpos = newpos;
7079 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007080 }
7081 return 0;
7082}
7083
Alexander Belopolsky40018472011-02-26 01:02:56 +00007084PyObject *
7085PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7086 Py_ssize_t size,
7087 PyObject *mapping,
7088 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007090 /* output object */
7091 PyObject *res = NULL;
7092 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007093 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007094 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007095 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007096 PyObject *errorHandler = NULL;
7097 PyObject *exc = NULL;
7098 /* the following variable is used for caching string comparisons
7099 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7100 * 3=ignore, 4=xmlcharrefreplace */
7101 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102
7103 /* Default to Latin-1 */
7104 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007107 /* allocate enough for a simple encoding without
7108 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007109 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007110 if (res == NULL)
7111 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007112 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007115 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007116 /* try to encode it */
7117 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7118 if (x==enc_EXCEPTION) /* error */
7119 goto onError;
7120 if (x==enc_FAILED) { /* unencodable character */
7121 if (charmap_encoding_error(p, size, &inpos, mapping,
7122 &exc,
7123 &known_errorHandler, &errorHandler, errors,
7124 &res, &respos)) {
7125 goto onError;
7126 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007127 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 else
7129 /* done with this character => adjust input position */
7130 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007133 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007134 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007135 if (_PyBytes_Resize(&res, respos) < 0)
7136 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007137
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007138 Py_XDECREF(exc);
7139 Py_XDECREF(errorHandler);
7140 return res;
7141
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007143 Py_XDECREF(res);
7144 Py_XDECREF(exc);
7145 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146 return NULL;
7147}
7148
Alexander Belopolsky40018472011-02-26 01:02:56 +00007149PyObject *
7150PyUnicode_AsCharmapString(PyObject *unicode,
7151 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152{
7153 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 PyErr_BadArgument();
7155 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156 }
7157 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007158 PyUnicode_GET_SIZE(unicode),
7159 mapping,
7160 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161}
7162
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007163/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007164static void
7165make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007166 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007167 Py_ssize_t startpos, Py_ssize_t endpos,
7168 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007170 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007171 *exceptionObject = _PyUnicodeTranslateError_Create(
7172 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 }
7174 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007175 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7176 goto onError;
7177 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7178 goto onError;
7179 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7180 goto onError;
7181 return;
7182 onError:
7183 Py_DECREF(*exceptionObject);
7184 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 }
7186}
7187
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007188/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007189static void
7190raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007191 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007192 Py_ssize_t startpos, Py_ssize_t endpos,
7193 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007194{
7195 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007196 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007197 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199}
7200
7201/* error handling callback helper:
7202 build arguments, call the callback and check the arguments,
7203 put the result into newpos and return the replacement string, which
7204 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007205static PyObject *
7206unicode_translate_call_errorhandler(const char *errors,
7207 PyObject **errorHandler,
7208 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007209 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007210 Py_ssize_t startpos, Py_ssize_t endpos,
7211 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007212{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007213 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007214
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007215 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007216 PyObject *restuple;
7217 PyObject *resunicode;
7218
7219 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007220 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007221 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007222 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007223 }
7224
7225 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007226 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007227 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007228 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007229
7230 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007232 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007234 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007235 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 Py_DECREF(restuple);
7237 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007238 }
7239 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 &resunicode, &i_newpos)) {
7241 Py_DECREF(restuple);
7242 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007243 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007244 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007245 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007246 else
7247 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007248 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007249 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7250 Py_DECREF(restuple);
7251 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007252 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007253 Py_INCREF(resunicode);
7254 Py_DECREF(restuple);
7255 return resunicode;
7256}
7257
7258/* Lookup the character ch in the mapping and put the result in result,
7259 which must be decrefed by the caller.
7260 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007261static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007262charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007263{
Christian Heimes217cfd12007-12-02 14:31:20 +00007264 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007265 PyObject *x;
7266
7267 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007269 x = PyObject_GetItem(mapping, w);
7270 Py_DECREF(w);
7271 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7273 /* No mapping found means: use 1:1 mapping. */
7274 PyErr_Clear();
7275 *result = NULL;
7276 return 0;
7277 } else
7278 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007279 }
7280 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007281 *result = x;
7282 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007283 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007284 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007285 long value = PyLong_AS_LONG(x);
7286 long max = PyUnicode_GetMax();
7287 if (value < 0 || value > max) {
7288 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007289 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007290 Py_DECREF(x);
7291 return -1;
7292 }
7293 *result = x;
7294 return 0;
7295 }
7296 else if (PyUnicode_Check(x)) {
7297 *result = x;
7298 return 0;
7299 }
7300 else {
7301 /* wrong return value */
7302 PyErr_SetString(PyExc_TypeError,
7303 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007304 Py_DECREF(x);
7305 return -1;
7306 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007307}
7308/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 if not reallocate and adjust various state variables.
7310 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007311static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007312charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007315 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007316 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 /* exponentially overallocate to minimize reallocations */
7318 if (requiredsize < 2 * oldsize)
7319 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007320 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7321 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007323 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007324 }
7325 return 0;
7326}
7327/* lookup the character, put the result in the output string and adjust
7328 various state variables. Return a new reference to the object that
7329 was put in the output buffer in *result, or Py_None, if the mapping was
7330 undefined (in which case no character was written).
7331 The called must decref result.
7332 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007333static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007334charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7335 PyObject *mapping, Py_UCS4 **output,
7336 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007337 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007338{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007339 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7340 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007342 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007344 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007345 }
7346 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007348 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007350 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007351 }
7352 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007353 Py_ssize_t repsize;
7354 if (PyUnicode_READY(*res) == -1)
7355 return -1;
7356 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 if (repsize==1) {
7358 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007359 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 }
7361 else if (repsize!=0) {
7362 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007363 Py_ssize_t requiredsize = *opos +
7364 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007366 Py_ssize_t i;
7367 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007369 for(i = 0; i < repsize; i++)
7370 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007372 }
7373 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007375 return 0;
7376}
7377
Alexander Belopolsky40018472011-02-26 01:02:56 +00007378PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007379_PyUnicode_TranslateCharmap(PyObject *input,
7380 PyObject *mapping,
7381 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007383 /* input object */
7384 char *idata;
7385 Py_ssize_t size, i;
7386 int kind;
7387 /* output buffer */
7388 Py_UCS4 *output = NULL;
7389 Py_ssize_t osize;
7390 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007391 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007392 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007393 char *reason = "character maps to <undefined>";
7394 PyObject *errorHandler = NULL;
7395 PyObject *exc = NULL;
7396 /* the following variable is used for caching string comparisons
7397 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7398 * 3=ignore, 4=xmlcharrefreplace */
7399 int known_errorHandler = -1;
7400
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 PyErr_BadArgument();
7403 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007406 if (PyUnicode_READY(input) == -1)
7407 return NULL;
7408 idata = (char*)PyUnicode_DATA(input);
7409 kind = PyUnicode_KIND(input);
7410 size = PyUnicode_GET_LENGTH(input);
7411 i = 0;
7412
7413 if (size == 0) {
7414 Py_INCREF(input);
7415 return input;
7416 }
7417
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007418 /* allocate enough for a simple 1:1 translation without
7419 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007420 osize = size;
7421 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7422 opos = 0;
7423 if (output == NULL) {
7424 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007426 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007428 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 /* try to encode it */
7430 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007431 if (charmaptranslate_output(input, i, mapping,
7432 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 Py_XDECREF(x);
7434 goto onError;
7435 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007436 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007438 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 else { /* untranslatable character */
7440 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7441 Py_ssize_t repsize;
7442 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007443 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007445 Py_ssize_t collstart = i;
7446 Py_ssize_t collend = i+1;
7447 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007450 while (collend < size) {
7451 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 goto onError;
7453 Py_XDECREF(x);
7454 if (x!=Py_None)
7455 break;
7456 ++collend;
7457 }
7458 /* cache callback name lookup
7459 * (if not done yet, i.e. it's the first error) */
7460 if (known_errorHandler==-1) {
7461 if ((errors==NULL) || (!strcmp(errors, "strict")))
7462 known_errorHandler = 1;
7463 else if (!strcmp(errors, "replace"))
7464 known_errorHandler = 2;
7465 else if (!strcmp(errors, "ignore"))
7466 known_errorHandler = 3;
7467 else if (!strcmp(errors, "xmlcharrefreplace"))
7468 known_errorHandler = 4;
7469 else
7470 known_errorHandler = 0;
7471 }
7472 switch (known_errorHandler) {
7473 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007474 raise_translate_exception(&exc, input, collstart,
7475 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007476 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 case 2: /* replace */
7478 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007479 for (coll = collstart; coll<collend; coll++)
7480 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007481 /* fall through */
7482 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007483 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 break;
7485 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007486 /* generate replacement (temporarily (mis)uses i) */
7487 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 char buffer[2+29+1+1];
7489 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007490 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7491 if (charmaptranslate_makespace(&output, &osize,
7492 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 goto onError;
7494 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007495 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007497 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 break;
7499 default:
7500 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007501 reason, input, &exc,
7502 collstart, collend, &newpos);
7503 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007504 goto onError;
7505 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007506 repsize = PyUnicode_GET_LENGTH(repunicode);
7507 if (charmaptranslate_makespace(&output, &osize,
7508 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 Py_DECREF(repunicode);
7510 goto onError;
7511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007512 for (uni2 = 0; repsize-->0; ++uni2)
7513 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7514 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007516 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007517 }
7518 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007519 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7520 if (!res)
7521 goto onError;
7522 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007523 Py_XDECREF(exc);
7524 Py_XDECREF(errorHandler);
7525 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007528 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007529 Py_XDECREF(exc);
7530 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 return NULL;
7532}
7533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007534/* Deprecated. Use PyUnicode_Translate instead. */
7535PyObject *
7536PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7537 Py_ssize_t size,
7538 PyObject *mapping,
7539 const char *errors)
7540{
7541 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7542 if (!unicode)
7543 return NULL;
7544 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7545}
7546
Alexander Belopolsky40018472011-02-26 01:02:56 +00007547PyObject *
7548PyUnicode_Translate(PyObject *str,
7549 PyObject *mapping,
7550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551{
7552 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007553
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554 str = PyUnicode_FromObject(str);
7555 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007557 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558 Py_DECREF(str);
7559 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007560
Benjamin Peterson29060642009-01-31 22:14:21 +00007561 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 Py_XDECREF(str);
7563 return NULL;
7564}
Tim Petersced69f82003-09-16 20:30:58 +00007565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007566static Py_UCS4
7567fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7568{
7569 /* No need to call PyUnicode_READY(self) because this function is only
7570 called as a callback from fixup() which does it already. */
7571 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7572 const int kind = PyUnicode_KIND(self);
7573 void *data = PyUnicode_DATA(self);
7574 Py_UCS4 maxchar = 0, ch, fixed;
7575 Py_ssize_t i;
7576
7577 for (i = 0; i < len; ++i) {
7578 ch = PyUnicode_READ(kind, data, i);
7579 fixed = 0;
7580 if (ch > 127) {
7581 if (Py_UNICODE_ISSPACE(ch))
7582 fixed = ' ';
7583 else {
7584 const int decimal = Py_UNICODE_TODECIMAL(ch);
7585 if (decimal >= 0)
7586 fixed = '0' + decimal;
7587 }
7588 if (fixed != 0) {
7589 if (fixed > maxchar)
7590 maxchar = fixed;
7591 PyUnicode_WRITE(kind, data, i, fixed);
7592 }
7593 else if (ch > maxchar)
7594 maxchar = ch;
7595 }
7596 else if (ch > maxchar)
7597 maxchar = ch;
7598 }
7599
7600 return maxchar;
7601}
7602
7603PyObject *
7604_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7605{
7606 if (!PyUnicode_Check(unicode)) {
7607 PyErr_BadInternalCall();
7608 return NULL;
7609 }
7610 if (PyUnicode_READY(unicode) == -1)
7611 return NULL;
7612 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7613 /* If the string is already ASCII, just return the same string */
7614 Py_INCREF(unicode);
7615 return unicode;
7616 }
7617 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7618}
7619
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007620PyObject *
7621PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7622 Py_ssize_t length)
7623{
7624 PyObject *result;
7625 Py_UNICODE *p; /* write pointer into result */
7626 Py_ssize_t i;
7627 /* Copy to a new string */
7628 result = (PyObject *)_PyUnicode_New(length);
7629 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7630 if (result == NULL)
7631 return result;
7632 p = PyUnicode_AS_UNICODE(result);
7633 /* Iterate over code points */
7634 for (i = 0; i < length; i++) {
7635 Py_UNICODE ch =s[i];
7636 if (ch > 127) {
7637 int decimal = Py_UNICODE_TODECIMAL(ch);
7638 if (decimal >= 0)
7639 p[i] = '0' + decimal;
7640 }
7641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007642 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7643 Py_DECREF(result);
7644 return NULL;
7645 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007646 return result;
7647}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007648/* --- Decimal Encoder ---------------------------------------------------- */
7649
Alexander Belopolsky40018472011-02-26 01:02:56 +00007650int
7651PyUnicode_EncodeDecimal(Py_UNICODE *s,
7652 Py_ssize_t length,
7653 char *output,
7654 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007655{
7656 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007657 PyObject *errorHandler = NULL;
7658 PyObject *exc = NULL;
7659 const char *encoding = "decimal";
7660 const char *reason = "invalid decimal Unicode string";
7661 /* the following variable is used for caching string comparisons
7662 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7663 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007664
7665 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007666 PyErr_BadArgument();
7667 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007668 }
7669
7670 p = s;
7671 end = s + length;
7672 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 register Py_UNICODE ch = *p;
7674 int decimal;
7675 PyObject *repunicode;
7676 Py_ssize_t repsize;
7677 Py_ssize_t newpos;
7678 Py_UNICODE *uni2;
7679 Py_UNICODE *collstart;
7680 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007681
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007683 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 ++p;
7685 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007686 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 decimal = Py_UNICODE_TODECIMAL(ch);
7688 if (decimal >= 0) {
7689 *output++ = '0' + decimal;
7690 ++p;
7691 continue;
7692 }
7693 if (0 < ch && ch < 256) {
7694 *output++ = (char)ch;
7695 ++p;
7696 continue;
7697 }
7698 /* All other characters are considered unencodable */
7699 collstart = p;
7700 collend = p+1;
7701 while (collend < end) {
7702 if ((0 < *collend && *collend < 256) ||
7703 !Py_UNICODE_ISSPACE(*collend) ||
7704 Py_UNICODE_TODECIMAL(*collend))
7705 break;
7706 }
7707 /* cache callback name lookup
7708 * (if not done yet, i.e. it's the first error) */
7709 if (known_errorHandler==-1) {
7710 if ((errors==NULL) || (!strcmp(errors, "strict")))
7711 known_errorHandler = 1;
7712 else if (!strcmp(errors, "replace"))
7713 known_errorHandler = 2;
7714 else if (!strcmp(errors, "ignore"))
7715 known_errorHandler = 3;
7716 else if (!strcmp(errors, "xmlcharrefreplace"))
7717 known_errorHandler = 4;
7718 else
7719 known_errorHandler = 0;
7720 }
7721 switch (known_errorHandler) {
7722 case 1: /* strict */
7723 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7724 goto onError;
7725 case 2: /* replace */
7726 for (p = collstart; p < collend; ++p)
7727 *output++ = '?';
7728 /* fall through */
7729 case 3: /* ignore */
7730 p = collend;
7731 break;
7732 case 4: /* xmlcharrefreplace */
7733 /* generate replacement (temporarily (mis)uses p) */
7734 for (p = collstart; p < collend; ++p)
7735 output += sprintf(output, "&#%d;", (int)*p);
7736 p = collend;
7737 break;
7738 default:
7739 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7740 encoding, reason, s, length, &exc,
7741 collstart-s, collend-s, &newpos);
7742 if (repunicode == NULL)
7743 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007744 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007745 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007746 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7747 Py_DECREF(repunicode);
7748 goto onError;
7749 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 /* generate replacement */
7751 repsize = PyUnicode_GET_SIZE(repunicode);
7752 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7753 Py_UNICODE ch = *uni2;
7754 if (Py_UNICODE_ISSPACE(ch))
7755 *output++ = ' ';
7756 else {
7757 decimal = Py_UNICODE_TODECIMAL(ch);
7758 if (decimal >= 0)
7759 *output++ = '0' + decimal;
7760 else if (0 < ch && ch < 256)
7761 *output++ = (char)ch;
7762 else {
7763 Py_DECREF(repunicode);
7764 raise_encode_exception(&exc, encoding,
7765 s, length, collstart-s, collend-s, reason);
7766 goto onError;
7767 }
7768 }
7769 }
7770 p = s + newpos;
7771 Py_DECREF(repunicode);
7772 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007773 }
7774 /* 0-terminate the output string */
7775 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007776 Py_XDECREF(exc);
7777 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007778 return 0;
7779
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007781 Py_XDECREF(exc);
7782 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007783 return -1;
7784}
7785
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786/* --- Helpers ------------------------------------------------------------ */
7787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007788#include "stringlib/ucs1lib.h"
7789#include "stringlib/fastsearch.h"
7790#include "stringlib/partition.h"
7791#include "stringlib/split.h"
7792#include "stringlib/count.h"
7793#include "stringlib/find.h"
7794#include "stringlib/localeutil.h"
7795#include "stringlib/undef.h"
7796
7797#include "stringlib/ucs2lib.h"
7798#include "stringlib/fastsearch.h"
7799#include "stringlib/partition.h"
7800#include "stringlib/split.h"
7801#include "stringlib/count.h"
7802#include "stringlib/find.h"
7803#include "stringlib/localeutil.h"
7804#include "stringlib/undef.h"
7805
7806#include "stringlib/ucs4lib.h"
7807#include "stringlib/fastsearch.h"
7808#include "stringlib/partition.h"
7809#include "stringlib/split.h"
7810#include "stringlib/count.h"
7811#include "stringlib/find.h"
7812#include "stringlib/localeutil.h"
7813#include "stringlib/undef.h"
7814
7815static Py_ssize_t
7816any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7817 const Py_UCS1*, Py_ssize_t,
7818 Py_ssize_t, Py_ssize_t),
7819 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7820 const Py_UCS2*, Py_ssize_t,
7821 Py_ssize_t, Py_ssize_t),
7822 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7823 const Py_UCS4*, Py_ssize_t,
7824 Py_ssize_t, Py_ssize_t),
7825 PyObject* s1, PyObject* s2,
7826 Py_ssize_t start,
7827 Py_ssize_t end)
7828{
7829 int kind1, kind2, kind;
7830 void *buf1, *buf2;
7831 Py_ssize_t len1, len2, result;
7832
7833 kind1 = PyUnicode_KIND(s1);
7834 kind2 = PyUnicode_KIND(s2);
7835 kind = kind1 > kind2 ? kind1 : kind2;
7836 buf1 = PyUnicode_DATA(s1);
7837 buf2 = PyUnicode_DATA(s2);
7838 if (kind1 != kind)
7839 buf1 = _PyUnicode_AsKind(s1, kind);
7840 if (!buf1)
7841 return -2;
7842 if (kind2 != kind)
7843 buf2 = _PyUnicode_AsKind(s2, kind);
7844 if (!buf2) {
7845 if (kind1 != kind) PyMem_Free(buf1);
7846 return -2;
7847 }
7848 len1 = PyUnicode_GET_LENGTH(s1);
7849 len2 = PyUnicode_GET_LENGTH(s2);
7850
7851 switch(kind) {
7852 case PyUnicode_1BYTE_KIND:
7853 result = ucs1(buf1, len1, buf2, len2, start, end);
7854 break;
7855 case PyUnicode_2BYTE_KIND:
7856 result = ucs2(buf1, len1, buf2, len2, start, end);
7857 break;
7858 case PyUnicode_4BYTE_KIND:
7859 result = ucs4(buf1, len1, buf2, len2, start, end);
7860 break;
7861 default:
7862 assert(0); result = -2;
7863 }
7864
7865 if (kind1 != kind)
7866 PyMem_Free(buf1);
7867 if (kind2 != kind)
7868 PyMem_Free(buf2);
7869
7870 return result;
7871}
7872
7873Py_ssize_t
7874_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7875 Py_ssize_t n_buffer,
7876 void *digits, Py_ssize_t n_digits,
7877 Py_ssize_t min_width,
7878 const char *grouping,
7879 const char *thousands_sep)
7880{
7881 switch(kind) {
7882 case PyUnicode_1BYTE_KIND:
7883 return _PyUnicode_ucs1_InsertThousandsGrouping(
7884 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7885 min_width, grouping, thousands_sep);
7886 case PyUnicode_2BYTE_KIND:
7887 return _PyUnicode_ucs2_InsertThousandsGrouping(
7888 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7889 min_width, grouping, thousands_sep);
7890 case PyUnicode_4BYTE_KIND:
7891 return _PyUnicode_ucs4_InsertThousandsGrouping(
7892 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7893 min_width, grouping, thousands_sep);
7894 }
7895 assert(0);
7896 return -1;
7897}
7898
7899
Eric Smith8c663262007-08-25 02:26:07 +00007900#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007901#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007902
Thomas Wouters477c8d52006-05-27 19:21:47 +00007903#include "stringlib/count.h"
7904#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007905
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` is taken
   relative to `len` and clamped at 0.  `start` and `end` must be
   modifiable lvalues; all three arguments are evaluated more than once,
   so they must be free of side effects.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and is safe inside an unbraced if/else.  Invoke it
   with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007920
Alexander Belopolsky40018472011-02-26 01:02:56 +00007921Py_ssize_t
7922PyUnicode_Count(PyObject *str,
7923 PyObject *substr,
7924 Py_ssize_t start,
7925 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007927 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007928 PyUnicodeObject* str_obj;
7929 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007930 int kind1, kind2, kind;
7931 void *buf1 = NULL, *buf2 = NULL;
7932 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007933
Thomas Wouters477c8d52006-05-27 19:21:47 +00007934 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007935 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007937 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02007938 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 Py_DECREF(str_obj);
7940 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007941 }
Tim Petersced69f82003-09-16 20:30:58 +00007942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007943 kind1 = PyUnicode_KIND(str_obj);
7944 kind2 = PyUnicode_KIND(sub_obj);
7945 kind = kind1 > kind2 ? kind1 : kind2;
7946 buf1 = PyUnicode_DATA(str_obj);
7947 if (kind1 != kind)
7948 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7949 if (!buf1)
7950 goto onError;
7951 buf2 = PyUnicode_DATA(sub_obj);
7952 if (kind2 != kind)
7953 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7954 if (!buf2)
7955 goto onError;
7956 len1 = PyUnicode_GET_LENGTH(str_obj);
7957 len2 = PyUnicode_GET_LENGTH(sub_obj);
7958
7959 ADJUST_INDICES(start, end, len1);
7960 switch(kind) {
7961 case PyUnicode_1BYTE_KIND:
7962 result = ucs1lib_count(
7963 ((Py_UCS1*)buf1) + start, end - start,
7964 buf2, len2, PY_SSIZE_T_MAX
7965 );
7966 break;
7967 case PyUnicode_2BYTE_KIND:
7968 result = ucs2lib_count(
7969 ((Py_UCS2*)buf1) + start, end - start,
7970 buf2, len2, PY_SSIZE_T_MAX
7971 );
7972 break;
7973 case PyUnicode_4BYTE_KIND:
7974 result = ucs4lib_count(
7975 ((Py_UCS4*)buf1) + start, end - start,
7976 buf2, len2, PY_SSIZE_T_MAX
7977 );
7978 break;
7979 default:
7980 assert(0); result = 0;
7981 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007982
7983 Py_DECREF(sub_obj);
7984 Py_DECREF(str_obj);
7985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007986 if (kind1 != kind)
7987 PyMem_Free(buf1);
7988 if (kind2 != kind)
7989 PyMem_Free(buf2);
7990
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007992 onError:
7993 Py_DECREF(sub_obj);
7994 Py_DECREF(str_obj);
7995 if (kind1 != kind && buf1)
7996 PyMem_Free(buf1);
7997 if (kind2 != kind && buf2)
7998 PyMem_Free(buf2);
7999 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000}
8001
Alexander Belopolsky40018472011-02-26 01:02:56 +00008002Py_ssize_t
8003PyUnicode_Find(PyObject *str,
8004 PyObject *sub,
8005 Py_ssize_t start,
8006 Py_ssize_t end,
8007 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008009 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008010
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008012 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008014 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008015 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 Py_DECREF(str);
8017 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 }
Tim Petersced69f82003-09-16 20:30:58 +00008019
Thomas Wouters477c8d52006-05-27 19:21:47 +00008020 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008021 result = any_find_slice(
8022 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8023 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008024 );
8025 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 result = any_find_slice(
8027 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8028 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008029 );
8030
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008032 Py_DECREF(sub);
8033
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034 return result;
8035}
8036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008037Py_ssize_t
8038PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8039 Py_ssize_t start, Py_ssize_t end,
8040 int direction)
8041{
8042 char *result;
8043 int kind;
8044 if (PyUnicode_READY(str) == -1)
8045 return -2;
8046 if (end > PyUnicode_GET_LENGTH(str))
8047 end = PyUnicode_GET_LENGTH(str);
8048 kind = PyUnicode_KIND(str);
8049 result = findchar(PyUnicode_1BYTE_DATA(str)
8050 + PyUnicode_KIND_SIZE(kind, start),
8051 kind,
8052 end-start, ch, direction);
8053 if (!result)
8054 return -1;
8055 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8056}
8057
Alexander Belopolsky40018472011-02-26 01:02:56 +00008058static int
8059tailmatch(PyUnicodeObject *self,
8060 PyUnicodeObject *substring,
8061 Py_ssize_t start,
8062 Py_ssize_t end,
8063 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008065 int kind_self;
8066 int kind_sub;
8067 void *data_self;
8068 void *data_sub;
8069 Py_ssize_t offset;
8070 Py_ssize_t i;
8071 Py_ssize_t end_sub;
8072
8073 if (PyUnicode_READY(self) == -1 ||
8074 PyUnicode_READY(substring) == -1)
8075 return 0;
8076
8077 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078 return 1;
8079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008080 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8081 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008085 kind_self = PyUnicode_KIND(self);
8086 data_self = PyUnicode_DATA(self);
8087 kind_sub = PyUnicode_KIND(substring);
8088 data_sub = PyUnicode_DATA(substring);
8089 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8090
8091 if (direction > 0)
8092 offset = end;
8093 else
8094 offset = start;
8095
8096 if (PyUnicode_READ(kind_self, data_self, offset) ==
8097 PyUnicode_READ(kind_sub, data_sub, 0) &&
8098 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8099 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8100 /* If both are of the same kind, memcmp is sufficient */
8101 if (kind_self == kind_sub) {
8102 return ! memcmp((char *)data_self +
8103 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8104 data_sub,
8105 PyUnicode_GET_LENGTH(substring) *
8106 PyUnicode_CHARACTER_SIZE(substring));
8107 }
8108 /* otherwise we have to compare each character by first accesing it */
8109 else {
8110 /* We do not need to compare 0 and len(substring)-1 because
8111 the if statement above ensured already that they are equal
8112 when we end up here. */
8113 // TODO: honor direction and do a forward or backwards search
8114 for (i = 1; i < end_sub; ++i) {
8115 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8116 PyUnicode_READ(kind_sub, data_sub, i))
8117 return 0;
8118 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008120 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121 }
8122
8123 return 0;
8124}
8125
Alexander Belopolsky40018472011-02-26 01:02:56 +00008126Py_ssize_t
8127PyUnicode_Tailmatch(PyObject *str,
8128 PyObject *substr,
8129 Py_ssize_t start,
8130 Py_ssize_t end,
8131 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008133 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008134
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 str = PyUnicode_FromObject(str);
8136 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 substr = PyUnicode_FromObject(substr);
8139 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008140 Py_DECREF(str);
8141 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 }
Tim Petersced69f82003-09-16 20:30:58 +00008143
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 (PyUnicodeObject *)substr,
8146 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 Py_DECREF(str);
8148 Py_DECREF(substr);
8149 return result;
8150}
8151
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152/* Apply fixfct filter to the Unicode object self and return a
8153 reference to the modified object */
8154
Alexander Belopolsky40018472011-02-26 01:02:56 +00008155static PyObject *
8156fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008157 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159 PyObject *u;
8160 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008162 if (PyUnicode_READY(self) == -1)
8163 return NULL;
8164 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8165 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8166 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008170 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8171 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008173 /* fix functions return the new maximum character in a string,
8174 if the kind of the resulting unicode object does not change,
8175 everything is fine. Otherwise we need to change the string kind
8176 and re-run the fix function. */
8177 maxchar_new = fixfct((PyUnicodeObject*)u);
8178 if (maxchar_new == 0)
8179 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8180 else if (maxchar_new <= 127)
8181 maxchar_new = 127;
8182 else if (maxchar_new <= 255)
8183 maxchar_new = 255;
8184 else if (maxchar_new <= 65535)
8185 maxchar_new = 65535;
8186 else
8187 maxchar_new = 1114111; /* 0x10ffff */
8188
8189 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 /* fixfct should return TRUE if it modified the buffer. If
8191 FALSE, return a reference to the original buffer instead
8192 (to save space, not time) */
8193 Py_INCREF(self);
8194 Py_DECREF(u);
8195 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008197 else if (maxchar_new == maxchar_old) {
8198 return u;
8199 }
8200 else {
8201 /* In case the maximum character changed, we need to
8202 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008203 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008204 if (v == NULL) {
8205 Py_DECREF(u);
8206 return NULL;
8207 }
8208 if (maxchar_new > maxchar_old) {
8209 /* If the maxchar increased so that the kind changed, not all
8210 characters are representable anymore and we need to fix the
8211 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008212 if (PyUnicode_CopyCharacters(v, 0,
8213 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008214 PyUnicode_GET_LENGTH(self)) < 0)
8215 {
8216 Py_DECREF(u);
8217 return NULL;
8218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008219 maxchar_old = fixfct((PyUnicodeObject*)v);
8220 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8221 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008222 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008223 if (PyUnicode_CopyCharacters(v, 0,
8224 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008225 PyUnicode_GET_LENGTH(self)) < 0)
8226 {
8227 Py_DECREF(u);
8228 return NULL;
8229 }
8230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008231
8232 Py_DECREF(u);
8233 return v;
8234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235}
8236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008237static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008238fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008240 /* No need to call PyUnicode_READY(self) because this function is only
8241 called as a callback from fixup() which does it already. */
8242 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8243 const int kind = PyUnicode_KIND(self);
8244 void *data = PyUnicode_DATA(self);
8245 int touched = 0;
8246 Py_UCS4 maxchar = 0;
8247 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008249 for (i = 0; i < len; ++i) {
8250 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8251 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8252 if (up != ch) {
8253 if (up > maxchar)
8254 maxchar = up;
8255 PyUnicode_WRITE(kind, data, i, up);
8256 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 else if (ch > maxchar)
8259 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 }
8261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008262 if (touched)
8263 return maxchar;
8264 else
8265 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266}
8267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008268static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008269fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8272 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8273 const int kind = PyUnicode_KIND(self);
8274 void *data = PyUnicode_DATA(self);
8275 int touched = 0;
8276 Py_UCS4 maxchar = 0;
8277 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008279 for(i = 0; i < len; ++i) {
8280 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8281 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8282 if (lo != ch) {
8283 if (lo > maxchar)
8284 maxchar = lo;
8285 PyUnicode_WRITE(kind, data, i, lo);
8286 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008288 else if (ch > maxchar)
8289 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 }
8291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008292 if (touched)
8293 return maxchar;
8294 else
8295 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296}
8297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008298static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008299fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008301 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8302 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8303 const int kind = PyUnicode_KIND(self);
8304 void *data = PyUnicode_DATA(self);
8305 int touched = 0;
8306 Py_UCS4 maxchar = 0;
8307 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008309 for(i = 0; i < len; ++i) {
8310 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8311 Py_UCS4 nu = 0;
8312
8313 if (Py_UNICODE_ISUPPER(ch))
8314 nu = Py_UNICODE_TOLOWER(ch);
8315 else if (Py_UNICODE_ISLOWER(ch))
8316 nu = Py_UNICODE_TOUPPER(ch);
8317
8318 if (nu != 0) {
8319 if (nu > maxchar)
8320 maxchar = nu;
8321 PyUnicode_WRITE(kind, data, i, nu);
8322 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324 else if (ch > maxchar)
8325 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 }
8327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008328 if (touched)
8329 return maxchar;
8330 else
8331 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332}
8333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008335fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008337 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8338 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8339 const int kind = PyUnicode_KIND(self);
8340 void *data = PyUnicode_DATA(self);
8341 int touched = 0;
8342 Py_UCS4 maxchar = 0;
8343 Py_ssize_t i = 0;
8344 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008345
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008346 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348
8349 ch = PyUnicode_READ(kind, data, i);
8350 if (!Py_UNICODE_ISUPPER(ch)) {
8351 maxchar = Py_UNICODE_TOUPPER(ch);
8352 PyUnicode_WRITE(kind, data, i, maxchar);
8353 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 ++i;
8356 for(; i < len; ++i) {
8357 ch = PyUnicode_READ(kind, data, i);
8358 if (!Py_UNICODE_ISLOWER(ch)) {
8359 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8360 if (lo > maxchar)
8361 maxchar = lo;
8362 PyUnicode_WRITE(kind, data, i, lo);
8363 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008364 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 else if (ch > maxchar)
8366 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368
8369 if (touched)
8370 return maxchar;
8371 else
8372 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373}
8374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008376fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8379 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8380 const int kind = PyUnicode_KIND(self);
8381 void *data = PyUnicode_DATA(self);
8382 Py_UCS4 maxchar = 0;
8383 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384 int previous_is_cased;
8385
8386 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 if (len == 1) {
8388 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8389 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8390 if (ti != ch) {
8391 PyUnicode_WRITE(kind, data, i, ti);
8392 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 }
8394 else
8395 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398 for(; i < len; ++i) {
8399 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8400 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008401
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008403 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008405 nu = Py_UNICODE_TOTITLE(ch);
8406
8407 if (nu > maxchar)
8408 maxchar = nu;
8409 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008410
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 if (Py_UNICODE_ISLOWER(ch) ||
8412 Py_UNICODE_ISUPPER(ch) ||
8413 Py_UNICODE_ISTITLE(ch))
8414 previous_is_cased = 1;
8415 else
8416 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008418 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419}
8420
Tim Peters8ce9f162004-08-27 01:49:32 +00008421PyObject *
8422PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008425 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008426 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008427 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008428 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8429 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008430 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008431 Py_ssize_t sz, i, res_offset;
8432 Py_UCS4 maxchar = 0;
8433 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434
Tim Peters05eba1f2004-08-27 21:32:02 +00008435 fseq = PySequence_Fast(seq, "");
8436 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008437 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008438 }
8439
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008440 /* NOTE: the following code can't call back into Python code,
8441 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008442 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008443
Tim Peters05eba1f2004-08-27 21:32:02 +00008444 seqlen = PySequence_Fast_GET_SIZE(fseq);
8445 /* If empty sequence, return u"". */
8446 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008448 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008449 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008450 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008451 /* If singleton sequence with an exact Unicode, return that. */
8452 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 item = items[0];
8454 if (PyUnicode_CheckExact(item)) {
8455 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008456 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 goto Done;
8458 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008459 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008460 else {
8461 /* Set up sep and seplen */
8462 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008463 /* fall back to a blank space separator */
8464 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008465 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008467 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008468 else {
8469 if (!PyUnicode_Check(separator)) {
8470 PyErr_Format(PyExc_TypeError,
8471 "separator: expected str instance,"
8472 " %.80s found",
8473 Py_TYPE(separator)->tp_name);
8474 goto onError;
8475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008476 if (PyUnicode_READY(separator) == -1)
8477 goto onError;
8478 sep = separator;
8479 seplen = PyUnicode_GET_LENGTH(separator);
8480 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8481 /* inc refcount to keep this code path symetric with the
8482 above case of a blank separator */
8483 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008484 }
8485 }
8486
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008487 /* There are at least two things to join, or else we have a subclass
8488 * of str in the sequence.
8489 * Do a pre-pass to figure out the total amount of space we'll
8490 * need (sz), and see whether all argument are strings.
8491 */
8492 sz = 0;
8493 for (i = 0; i < seqlen; i++) {
8494 const Py_ssize_t old_sz = sz;
8495 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 if (!PyUnicode_Check(item)) {
8497 PyErr_Format(PyExc_TypeError,
8498 "sequence item %zd: expected str instance,"
8499 " %.80s found",
8500 i, Py_TYPE(item)->tp_name);
8501 goto onError;
8502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 if (PyUnicode_READY(item) == -1)
8504 goto onError;
8505 sz += PyUnicode_GET_LENGTH(item);
8506 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8507 if (item_maxchar > maxchar)
8508 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008509 if (i != 0)
8510 sz += seplen;
8511 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8512 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008514 goto onError;
8515 }
8516 }
Tim Petersced69f82003-09-16 20:30:58 +00008517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008518 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008519 if (res == NULL)
8520 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008521
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008522 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008524 Py_ssize_t itemlen;
8525 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 /* Copy item, and maybe the separator. */
8528 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008529 if (PyUnicode_CopyCharacters(res, res_offset,
8530 sep, 0, seplen) < 0)
8531 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008534 if (PyUnicode_CopyCharacters(res, res_offset,
8535 item, 0, itemlen) < 0)
8536 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008538 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008540
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008542 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008543 Py_XDECREF(sep);
8544 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008547 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008549 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550 return NULL;
8551}
8552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553#define FILL(kind, data, value, start, length) \
8554 do { \
8555 Py_ssize_t i_ = 0; \
8556 assert(kind != PyUnicode_WCHAR_KIND); \
8557 switch ((kind)) { \
8558 case PyUnicode_1BYTE_KIND: { \
8559 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8560 memset(to_, (unsigned char)value, length); \
8561 break; \
8562 } \
8563 case PyUnicode_2BYTE_KIND: { \
8564 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8565 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8566 break; \
8567 } \
8568 default: { \
8569 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8570 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8571 break; \
8572 } \
8573 } \
8574 } while (0)
8575
Alexander Belopolsky40018472011-02-26 01:02:56 +00008576static PyUnicodeObject *
8577pad(PyUnicodeObject *self,
8578 Py_ssize_t left,
8579 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 PyObject *u;
8583 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008584 int kind;
8585 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586
8587 if (left < 0)
8588 left = 0;
8589 if (right < 0)
8590 right = 0;
8591
Tim Peters7a29bd52001-09-12 03:03:31 +00008592 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 Py_INCREF(self);
8594 return self;
8595 }
8596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008597 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8598 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008599 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8600 return NULL;
8601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8603 if (fill > maxchar)
8604 maxchar = fill;
8605 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008606 if (!u)
8607 return NULL;
8608
8609 kind = PyUnicode_KIND(u);
8610 data = PyUnicode_DATA(u);
8611 if (left)
8612 FILL(kind, data, fill, 0, left);
8613 if (right)
8614 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008615 if (PyUnicode_CopyCharacters(u, left,
8616 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008617 _PyUnicode_LENGTH(self)) < 0)
8618 {
8619 Py_DECREF(u);
8620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 }
8622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626
Alexander Belopolsky40018472011-02-26 01:02:56 +00008627PyObject *
8628PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631
8632 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 switch(PyUnicode_KIND(string)) {
8637 case PyUnicode_1BYTE_KIND:
8638 list = ucs1lib_splitlines(
8639 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8640 PyUnicode_GET_LENGTH(string), keepends);
8641 break;
8642 case PyUnicode_2BYTE_KIND:
8643 list = ucs2lib_splitlines(
8644 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8645 PyUnicode_GET_LENGTH(string), keepends);
8646 break;
8647 case PyUnicode_4BYTE_KIND:
8648 list = ucs4lib_splitlines(
8649 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8650 PyUnicode_GET_LENGTH(string), keepends);
8651 break;
8652 default:
8653 assert(0);
8654 list = 0;
8655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656 Py_DECREF(string);
8657 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658}
8659
Alexander Belopolsky40018472011-02-26 01:02:56 +00008660static PyObject *
8661split(PyUnicodeObject *self,
8662 PyUnicodeObject *substring,
8663 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 int kind1, kind2, kind;
8666 void *buf1, *buf2;
8667 Py_ssize_t len1, len2;
8668 PyObject* out;
8669
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008671 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673 if (PyUnicode_READY(self) == -1)
8674 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 if (substring == NULL)
8677 switch(PyUnicode_KIND(self)) {
8678 case PyUnicode_1BYTE_KIND:
8679 return ucs1lib_split_whitespace(
8680 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8681 PyUnicode_GET_LENGTH(self), maxcount
8682 );
8683 case PyUnicode_2BYTE_KIND:
8684 return ucs2lib_split_whitespace(
8685 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8686 PyUnicode_GET_LENGTH(self), maxcount
8687 );
8688 case PyUnicode_4BYTE_KIND:
8689 return ucs4lib_split_whitespace(
8690 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8691 PyUnicode_GET_LENGTH(self), maxcount
8692 );
8693 default:
8694 assert(0);
8695 return NULL;
8696 }
8697
8698 if (PyUnicode_READY(substring) == -1)
8699 return NULL;
8700
8701 kind1 = PyUnicode_KIND(self);
8702 kind2 = PyUnicode_KIND(substring);
8703 kind = kind1 > kind2 ? kind1 : kind2;
8704 buf1 = PyUnicode_DATA(self);
8705 buf2 = PyUnicode_DATA(substring);
8706 if (kind1 != kind)
8707 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8708 if (!buf1)
8709 return NULL;
8710 if (kind2 != kind)
8711 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8712 if (!buf2) {
8713 if (kind1 != kind) PyMem_Free(buf1);
8714 return NULL;
8715 }
8716 len1 = PyUnicode_GET_LENGTH(self);
8717 len2 = PyUnicode_GET_LENGTH(substring);
8718
8719 switch(kind) {
8720 case PyUnicode_1BYTE_KIND:
8721 out = ucs1lib_split(
8722 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8723 break;
8724 case PyUnicode_2BYTE_KIND:
8725 out = ucs2lib_split(
8726 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8727 break;
8728 case PyUnicode_4BYTE_KIND:
8729 out = ucs4lib_split(
8730 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8731 break;
8732 default:
8733 out = NULL;
8734 }
8735 if (kind1 != kind)
8736 PyMem_Free(buf1);
8737 if (kind2 != kind)
8738 PyMem_Free(buf2);
8739 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740}
8741
Alexander Belopolsky40018472011-02-26 01:02:56 +00008742static PyObject *
8743rsplit(PyUnicodeObject *self,
8744 PyUnicodeObject *substring,
8745 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008747 int kind1, kind2, kind;
8748 void *buf1, *buf2;
8749 Py_ssize_t len1, len2;
8750 PyObject* out;
8751
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008752 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008753 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 if (PyUnicode_READY(self) == -1)
8756 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758 if (substring == NULL)
8759 switch(PyUnicode_KIND(self)) {
8760 case PyUnicode_1BYTE_KIND:
8761 return ucs1lib_rsplit_whitespace(
8762 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8763 PyUnicode_GET_LENGTH(self), maxcount
8764 );
8765 case PyUnicode_2BYTE_KIND:
8766 return ucs2lib_rsplit_whitespace(
8767 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8768 PyUnicode_GET_LENGTH(self), maxcount
8769 );
8770 case PyUnicode_4BYTE_KIND:
8771 return ucs4lib_rsplit_whitespace(
8772 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8773 PyUnicode_GET_LENGTH(self), maxcount
8774 );
8775 default:
8776 assert(0);
8777 return NULL;
8778 }
8779
8780 if (PyUnicode_READY(substring) == -1)
8781 return NULL;
8782
8783 kind1 = PyUnicode_KIND(self);
8784 kind2 = PyUnicode_KIND(substring);
8785 kind = kind1 > kind2 ? kind1 : kind2;
8786 buf1 = PyUnicode_DATA(self);
8787 buf2 = PyUnicode_DATA(substring);
8788 if (kind1 != kind)
8789 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8790 if (!buf1)
8791 return NULL;
8792 if (kind2 != kind)
8793 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8794 if (!buf2) {
8795 if (kind1 != kind) PyMem_Free(buf1);
8796 return NULL;
8797 }
8798 len1 = PyUnicode_GET_LENGTH(self);
8799 len2 = PyUnicode_GET_LENGTH(substring);
8800
8801 switch(kind) {
8802 case PyUnicode_1BYTE_KIND:
8803 out = ucs1lib_rsplit(
8804 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8805 break;
8806 case PyUnicode_2BYTE_KIND:
8807 out = ucs2lib_rsplit(
8808 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8809 break;
8810 case PyUnicode_4BYTE_KIND:
8811 out = ucs4lib_rsplit(
8812 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8813 break;
8814 default:
8815 out = NULL;
8816 }
8817 if (kind1 != kind)
8818 PyMem_Free(buf1);
8819 if (kind2 != kind)
8820 PyMem_Free(buf2);
8821 return out;
8822}
8823
8824static Py_ssize_t
8825anylib_find(int kind, void *buf1, Py_ssize_t len1,
8826 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8827{
8828 switch(kind) {
8829 case PyUnicode_1BYTE_KIND:
8830 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8831 case PyUnicode_2BYTE_KIND:
8832 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8833 case PyUnicode_4BYTE_KIND:
8834 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8835 }
8836 assert(0);
8837 return -1;
8838}
8839
8840static Py_ssize_t
8841anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8842 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8843{
8844 switch(kind) {
8845 case PyUnicode_1BYTE_KIND:
8846 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8847 case PyUnicode_2BYTE_KIND:
8848 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8849 case PyUnicode_4BYTE_KIND:
8850 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8851 }
8852 assert(0);
8853 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008854}
8855
Alexander Belopolsky40018472011-02-26 01:02:56 +00008856static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857replace(PyObject *self, PyObject *str1,
8858 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008860 PyObject *u;
8861 char *sbuf = PyUnicode_DATA(self);
8862 char *buf1 = PyUnicode_DATA(str1);
8863 char *buf2 = PyUnicode_DATA(str2);
8864 int srelease = 0, release1 = 0, release2 = 0;
8865 int skind = PyUnicode_KIND(self);
8866 int kind1 = PyUnicode_KIND(str1);
8867 int kind2 = PyUnicode_KIND(str2);
8868 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8869 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8870 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871
8872 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008875 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008877 if (skind < kind1)
8878 /* substring too wide to be present */
8879 goto nothing;
8880
8881 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008882 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008883 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008885 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008887 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 Py_UCS4 u1, u2, maxchar;
8889 int mayshrink, rkind;
8890 u1 = PyUnicode_READ_CHAR(str1, 0);
8891 if (!findchar(sbuf, PyUnicode_KIND(self),
8892 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008893 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008894 u2 = PyUnicode_READ_CHAR(str2, 0);
8895 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8896 /* Replacing u1 with u2 may cause a maxchar reduction in the
8897 result string. */
8898 mayshrink = maxchar > 127;
8899 if (u2 > maxchar) {
8900 maxchar = u2;
8901 mayshrink = 0;
8902 }
8903 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008904 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008906 if (PyUnicode_CopyCharacters(u, 0,
8907 (PyObject*)self, 0, slen) < 0)
8908 {
8909 Py_DECREF(u);
8910 return NULL;
8911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008912 rkind = PyUnicode_KIND(u);
8913 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8914 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008915 if (--maxcount < 0)
8916 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919 if (mayshrink) {
8920 PyObject *tmp = u;
8921 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8922 PyUnicode_GET_LENGTH(tmp));
8923 Py_DECREF(tmp);
8924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 int rkind = skind;
8927 char *res;
8928 if (kind1 < rkind) {
8929 /* widen substring */
8930 buf1 = _PyUnicode_AsKind(str1, rkind);
8931 if (!buf1) goto error;
8932 release1 = 1;
8933 }
8934 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008935 if (i < 0)
8936 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937 if (rkind > kind2) {
8938 /* widen replacement */
8939 buf2 = _PyUnicode_AsKind(str2, rkind);
8940 if (!buf2) goto error;
8941 release2 = 1;
8942 }
8943 else if (rkind < kind2) {
8944 /* widen self and buf1 */
8945 rkind = kind2;
8946 if (release1) PyMem_Free(buf1);
8947 sbuf = _PyUnicode_AsKind(self, rkind);
8948 if (!sbuf) goto error;
8949 srelease = 1;
8950 buf1 = _PyUnicode_AsKind(str1, rkind);
8951 if (!buf1) goto error;
8952 release1 = 1;
8953 }
8954 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8955 if (!res) {
8956 PyErr_NoMemory();
8957 goto error;
8958 }
8959 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008960 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8962 buf2,
8963 PyUnicode_KIND_SIZE(rkind, len2));
8964 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008965
8966 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8968 slen-i,
8969 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008970 if (i == -1)
8971 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8973 buf2,
8974 PyUnicode_KIND_SIZE(rkind, len2));
8975 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008976 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977
8978 u = PyUnicode_FromKindAndData(rkind, res, slen);
8979 PyMem_Free(res);
8980 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984 Py_ssize_t n, i, j, ires;
8985 Py_ssize_t product, new_size;
8986 int rkind = skind;
8987 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 if (kind1 < rkind) {
8990 buf1 = _PyUnicode_AsKind(str1, rkind);
8991 if (!buf1) goto error;
8992 release1 = 1;
8993 }
8994 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008995 if (n == 0)
8996 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 if (kind2 < rkind) {
8998 buf2 = _PyUnicode_AsKind(str2, rkind);
8999 if (!buf2) goto error;
9000 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 else if (kind2 > rkind) {
9003 rkind = kind2;
9004 sbuf = _PyUnicode_AsKind(self, rkind);
9005 if (!sbuf) goto error;
9006 srelease = 1;
9007 if (release1) PyMem_Free(buf1);
9008 buf1 = _PyUnicode_AsKind(str1, rkind);
9009 if (!buf1) goto error;
9010 release1 = 1;
9011 }
9012 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9013 PyUnicode_GET_LENGTH(str1))); */
9014 product = n * (len2-len1);
9015 if ((product / (len2-len1)) != n) {
9016 PyErr_SetString(PyExc_OverflowError,
9017 "replace string is too long");
9018 goto error;
9019 }
9020 new_size = slen + product;
9021 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9022 PyErr_SetString(PyExc_OverflowError,
9023 "replace string is too long");
9024 goto error;
9025 }
9026 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9027 if (!res)
9028 goto error;
9029 ires = i = 0;
9030 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009031 while (n-- > 0) {
9032 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009033 j = anylib_find(rkind,
9034 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9035 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009036 if (j == -1)
9037 break;
9038 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009039 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9041 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9042 PyUnicode_KIND_SIZE(rkind, j-i));
9043 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009044 }
9045 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 if (len2 > 0) {
9047 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9048 buf2,
9049 PyUnicode_KIND_SIZE(rkind, len2));
9050 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009055 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9057 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9058 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009059 } else {
9060 /* interleave */
9061 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9063 buf2,
9064 PyUnicode_KIND_SIZE(rkind, len2));
9065 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009066 if (--n <= 0)
9067 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9069 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9070 PyUnicode_KIND_SIZE(rkind, 1));
9071 ires++;
9072 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9075 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9076 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 if (srelease)
9081 PyMem_FREE(sbuf);
9082 if (release1)
9083 PyMem_FREE(buf1);
9084 if (release2)
9085 PyMem_FREE(buf2);
9086 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009087
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009089 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 if (srelease)
9091 PyMem_FREE(sbuf);
9092 if (release1)
9093 PyMem_FREE(buf1);
9094 if (release2)
9095 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009096 if (PyUnicode_CheckExact(self)) {
9097 Py_INCREF(self);
9098 return (PyObject *) self;
9099 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009100 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 error:
9102 if (srelease && sbuf)
9103 PyMem_FREE(sbuf);
9104 if (release1 && buf1)
9105 PyMem_FREE(buf1);
9106 if (release2 && buf2)
9107 PyMem_FREE(buf2);
9108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109}
9110
9111/* --- Unicode Object Methods --------------------------------------------- */
9112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009113PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009114 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009115\n\
9116Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009117characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118
9119static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009120unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122 return fixup(self, fixtitle);
9123}
9124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009125PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127\n\
9128Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009129have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130
9131static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009132unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134 return fixup(self, fixcapitalize);
9135}
9136
#if 0
PyDoc_STRVAR(capwords__doc__,
    "S.capwords() -> str\n"
    "\n"
    "Apply .capitalize() to all words in S and return the result with\n"
    "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into a list of
   words, replace each list entry by its fixcapitalize'd version, then
   join the list again.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;              /* joined is still NULL here */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return joined;
}
#endif
9174
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009175/* Argument converter. Coerces to a single unicode character */
9176
9177static int
9178convert_uc(PyObject *obj, void *addr)
9179{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009181 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009182
Benjamin Peterson14339b62009-01-31 16:36:08 +00009183 uniobj = PyUnicode_FromObject(obj);
9184 if (uniobj == NULL) {
9185 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009187 return 0;
9188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009190 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009192 Py_DECREF(uniobj);
9193 return 0;
9194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009196 Py_DECREF(uniobj);
9197 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009198}
9199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009200PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009201 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009203Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009204done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205
9206static PyObject *
9207unicode_center(PyUnicodeObject *self, PyObject *args)
9208{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009209 Py_ssize_t marg, left;
9210 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009211 Py_UCS4 fillchar = ' ';
9212
Victor Stinnere9a29352011-10-01 02:14:59 +02009213 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215
Victor Stinnere9a29352011-10-01 02:14:59 +02009216 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217 return NULL;
9218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220 Py_INCREF(self);
9221 return (PyObject*) self;
9222 }
9223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225 left = marg / 2 + (marg & width & 1);
9226
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009227 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228}
9229
Marc-André Lemburge5034372000-08-08 08:04:29 +00009230#if 0
9231
9232/* This code should go into some future Unicode collation support
9233 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009234 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009235
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009236/* speedy UTF-16 code point order comparison */
9237/* gleaned from: */
9238/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9239
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009240static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009241{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009242 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009243 0, 0, 0, 0, 0, 0, 0, 0,
9244 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009245 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009246};
9247
Guido van Rossumd57fd912000-03-10 22:53:23 +00009248static int
9249unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9250{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009251 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009252
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253 Py_UNICODE *s1 = str1->str;
9254 Py_UNICODE *s2 = str2->str;
9255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 len1 = str1->_base._base.length;
9257 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009258
Guido van Rossumd57fd912000-03-10 22:53:23 +00009259 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009260 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009261
9262 c1 = *s1++;
9263 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009264
Benjamin Peterson29060642009-01-31 22:14:21 +00009265 if (c1 > (1<<11) * 26)
9266 c1 += utf16Fixup[c1>>11];
9267 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009268 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009269 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009270
9271 if (c1 != c2)
9272 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009273
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009274 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275 }
9276
9277 return (len1 < len2) ? -1 : (len1 != len2);
9278}
9279
Marc-André Lemburge5034372000-08-08 08:04:29 +00009280#else
9281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282/* This function assumes that str1 and str2 are readied by the caller. */
9283
Marc-André Lemburge5034372000-08-08 08:04:29 +00009284static int
9285unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 int kind1, kind2;
9288 void *data1, *data2;
9289 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 kind1 = PyUnicode_KIND(str1);
9292 kind2 = PyUnicode_KIND(str2);
9293 data1 = PyUnicode_DATA(str1);
9294 data2 = PyUnicode_DATA(str2);
9295 len1 = PyUnicode_GET_LENGTH(str1);
9296 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 for (i = 0; i < len1 && i < len2; ++i) {
9299 Py_UCS4 c1, c2;
9300 c1 = PyUnicode_READ(kind1, data1, i);
9301 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009302
9303 if (c1 != c2)
9304 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009305 }
9306
9307 return (len1 < len2) ? -1 : (len1 != len2);
9308}
9309
9310#endif
9311
Alexander Belopolsky40018472011-02-26 01:02:56 +00009312int
9313PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9316 if (PyUnicode_READY(left) == -1 ||
9317 PyUnicode_READY(right) == -1)
9318 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009319 return unicode_compare((PyUnicodeObject *)left,
9320 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009322 PyErr_Format(PyExc_TypeError,
9323 "Can't compare %.100s and %.100s",
9324 left->ob_type->tp_name,
9325 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326 return -1;
9327}
9328
Martin v. Löwis5b222132007-06-10 09:51:05 +00009329int
9330PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9331{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332 Py_ssize_t i;
9333 int kind;
9334 void *data;
9335 Py_UCS4 chr;
9336
Martin v. Löwis5b222132007-06-10 09:51:05 +00009337 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 if (PyUnicode_READY(uni) == -1)
9339 return -1;
9340 kind = PyUnicode_KIND(uni);
9341 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009342 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9344 if (chr != str[i])
9345 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009346 /* This check keeps Python strings that end in '\0' from comparing equal
9347 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009349 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009350 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009352 return 0;
9353}
9354
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009355
Benjamin Peterson29060642009-01-31 22:14:21 +00009356#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009357 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009358
Alexander Belopolsky40018472011-02-26 01:02:56 +00009359PyObject *
9360PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009361{
9362 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009363
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009364 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9365 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 if (PyUnicode_READY(left) == -1 ||
9367 PyUnicode_READY(right) == -1)
9368 return NULL;
9369 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9370 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009371 if (op == Py_EQ) {
9372 Py_INCREF(Py_False);
9373 return Py_False;
9374 }
9375 if (op == Py_NE) {
9376 Py_INCREF(Py_True);
9377 return Py_True;
9378 }
9379 }
9380 if (left == right)
9381 result = 0;
9382 else
9383 result = unicode_compare((PyUnicodeObject *)left,
9384 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009385
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009386 /* Convert the return value to a Boolean */
9387 switch (op) {
9388 case Py_EQ:
9389 v = TEST_COND(result == 0);
9390 break;
9391 case Py_NE:
9392 v = TEST_COND(result != 0);
9393 break;
9394 case Py_LE:
9395 v = TEST_COND(result <= 0);
9396 break;
9397 case Py_GE:
9398 v = TEST_COND(result >= 0);
9399 break;
9400 case Py_LT:
9401 v = TEST_COND(result == -1);
9402 break;
9403 case Py_GT:
9404 v = TEST_COND(result == 1);
9405 break;
9406 default:
9407 PyErr_BadArgument();
9408 return NULL;
9409 }
9410 Py_INCREF(v);
9411 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009412 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009413
Brian Curtindfc80e32011-08-10 20:28:54 -05009414 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009415}
9416
Alexander Belopolsky40018472011-02-26 01:02:56 +00009417int
9418PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009419{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009420 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 int kind1, kind2, kind;
9422 void *buf1, *buf2;
9423 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009424 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009425
9426 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009427 sub = PyUnicode_FromObject(element);
9428 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 PyErr_Format(PyExc_TypeError,
9430 "'in <string>' requires string as left operand, not %s",
9431 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009432 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009433 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 if (PyUnicode_READY(sub) == -1)
9435 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009436
Thomas Wouters477c8d52006-05-27 19:21:47 +00009437 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009438 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009439 Py_DECREF(sub);
9440 return -1;
9441 }
9442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 kind1 = PyUnicode_KIND(str);
9444 kind2 = PyUnicode_KIND(sub);
9445 kind = kind1 > kind2 ? kind1 : kind2;
9446 buf1 = PyUnicode_DATA(str);
9447 buf2 = PyUnicode_DATA(sub);
9448 if (kind1 != kind)
9449 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9450 if (!buf1) {
9451 Py_DECREF(sub);
9452 return -1;
9453 }
9454 if (kind2 != kind)
9455 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9456 if (!buf2) {
9457 Py_DECREF(sub);
9458 if (kind1 != kind) PyMem_Free(buf1);
9459 return -1;
9460 }
9461 len1 = PyUnicode_GET_LENGTH(str);
9462 len2 = PyUnicode_GET_LENGTH(sub);
9463
9464 switch(kind) {
9465 case PyUnicode_1BYTE_KIND:
9466 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9467 break;
9468 case PyUnicode_2BYTE_KIND:
9469 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9470 break;
9471 case PyUnicode_4BYTE_KIND:
9472 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9473 break;
9474 default:
9475 result = -1;
9476 assert(0);
9477 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009478
9479 Py_DECREF(str);
9480 Py_DECREF(sub);
9481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 if (kind1 != kind)
9483 PyMem_Free(buf1);
9484 if (kind2 != kind)
9485 PyMem_Free(buf2);
9486
Guido van Rossum403d68b2000-03-13 15:55:09 +00009487 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009488}
9489
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490/* Concat to string or Unicode object giving a new Unicode object. */
9491
Alexander Belopolsky40018472011-02-26 01:02:56 +00009492PyObject *
9493PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 PyObject *u = NULL, *v = NULL, *w;
9496 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497
9498 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505
9506 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009512 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 }
9515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009517 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 w = PyUnicode_New(
9521 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9522 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009524 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009525 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9526 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009527 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009528 v, 0,
9529 PyUnicode_GET_LENGTH(v)) < 0)
9530 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531 Py_DECREF(u);
9532 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536 Py_XDECREF(u);
9537 Py_XDECREF(v);
9538 return NULL;
9539}
9540
Walter Dörwald1ab83302007-05-18 17:15:44 +00009541void
9542PyUnicode_Append(PyObject **pleft, PyObject *right)
9543{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009544 PyObject *new;
9545 if (*pleft == NULL)
9546 return;
9547 if (right == NULL || !PyUnicode_Check(*pleft)) {
9548 Py_DECREF(*pleft);
9549 *pleft = NULL;
9550 return;
9551 }
9552 new = PyUnicode_Concat(*pleft, right);
9553 Py_DECREF(*pleft);
9554 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009555}
9556
9557void
9558PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9559{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009560 PyUnicode_Append(pleft, right);
9561 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009562}
9563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009564PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009565 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009567Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009568string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009569interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570
9571static PyObject *
9572unicode_count(PyUnicodeObject *self, PyObject *args)
9573{
9574 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009575 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009576 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009577 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009578 int kind1, kind2, kind;
9579 void *buf1, *buf2;
9580 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581
Jesus Ceaac451502011-04-20 17:09:23 +02009582 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9583 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009584 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 kind1 = PyUnicode_KIND(self);
9587 kind2 = PyUnicode_KIND(substring);
9588 kind = kind1 > kind2 ? kind1 : kind2;
9589 buf1 = PyUnicode_DATA(self);
9590 buf2 = PyUnicode_DATA(substring);
9591 if (kind1 != kind)
9592 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9593 if (!buf1) {
9594 Py_DECREF(substring);
9595 return NULL;
9596 }
9597 if (kind2 != kind)
9598 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9599 if (!buf2) {
9600 Py_DECREF(substring);
9601 if (kind1 != kind) PyMem_Free(buf1);
9602 return NULL;
9603 }
9604 len1 = PyUnicode_GET_LENGTH(self);
9605 len2 = PyUnicode_GET_LENGTH(substring);
9606
9607 ADJUST_INDICES(start, end, len1);
9608 switch(kind) {
9609 case PyUnicode_1BYTE_KIND:
9610 iresult = ucs1lib_count(
9611 ((Py_UCS1*)buf1) + start, end - start,
9612 buf2, len2, PY_SSIZE_T_MAX
9613 );
9614 break;
9615 case PyUnicode_2BYTE_KIND:
9616 iresult = ucs2lib_count(
9617 ((Py_UCS2*)buf1) + start, end - start,
9618 buf2, len2, PY_SSIZE_T_MAX
9619 );
9620 break;
9621 case PyUnicode_4BYTE_KIND:
9622 iresult = ucs4lib_count(
9623 ((Py_UCS4*)buf1) + start, end - start,
9624 buf2, len2, PY_SSIZE_T_MAX
9625 );
9626 break;
9627 default:
9628 assert(0); iresult = 0;
9629 }
9630
9631 result = PyLong_FromSsize_t(iresult);
9632
9633 if (kind1 != kind)
9634 PyMem_Free(buf1);
9635 if (kind2 != kind)
9636 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637
9638 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009639
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640 return result;
9641}
9642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009643PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009644 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009646Encode S using the codec registered for encoding. Default encoding\n\
9647is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009648handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009649a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9650'xmlcharrefreplace' as well as any other name registered with\n\
9651codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652
9653static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009654unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009656 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657 char *encoding = NULL;
9658 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009659
Benjamin Peterson308d6372009-09-18 21:42:35 +00009660 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9661 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009663 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009664}
9665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009666PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009667 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668\n\
9669Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009670If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671
9672static PyObject*
9673unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9674{
9675 Py_UNICODE *e;
9676 Py_UNICODE *p;
9677 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009678 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680 PyUnicodeObject *u;
9681 int tabsize = 8;
9682
9683 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009686 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9687 return NULL;
9688
Thomas Wouters7e474022000-07-16 12:04:32 +00009689 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009690 i = 0; /* chars up to and including most recent \n or \r */
9691 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9693 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009695 if (tabsize > 0) {
9696 incr = tabsize - (j % tabsize); /* cannot overflow */
9697 if (j > PY_SSIZE_T_MAX - incr)
9698 goto overflow1;
9699 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009700 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009703 if (j > PY_SSIZE_T_MAX - 1)
9704 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705 j++;
9706 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009707 if (i > PY_SSIZE_T_MAX - j)
9708 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009710 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711 }
9712 }
9713
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009714 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009715 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009716
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717 /* Second pass: create output string and fill it */
9718 u = _PyUnicode_New(i + j);
9719 if (!u)
9720 return NULL;
9721
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009722 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723 q = _PyUnicode_WSTR(u); /* next output char */
9724 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009728 if (tabsize > 0) {
9729 i = tabsize - (j % tabsize);
9730 j += i;
9731 while (i--) {
9732 if (q >= qe)
9733 goto overflow2;
9734 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009735 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009736 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009737 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009738 else {
9739 if (q >= qe)
9740 goto overflow2;
9741 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009742 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743 if (*p == '\n' || *p == '\r')
9744 j = 0;
9745 }
9746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747 if (PyUnicode_READY(u) == -1) {
9748 Py_DECREF(u);
9749 return NULL;
9750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009752
9753 overflow2:
9754 Py_DECREF(u);
9755 overflow1:
9756 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758}
9759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009760PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009761 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762\n\
9763Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009764such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765arguments start and end are interpreted as in slice notation.\n\
9766\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009767Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768
9769static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771{
Jesus Ceaac451502011-04-20 17:09:23 +02009772 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009773 Py_ssize_t start;
9774 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009775 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776
Jesus Ceaac451502011-04-20 17:09:23 +02009777 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9778 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 if (PyUnicode_READY(self) == -1)
9782 return NULL;
9783 if (PyUnicode_READY(substring) == -1)
9784 return NULL;
9785
9786 result = any_find_slice(
9787 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9788 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009789 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790
9791 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 if (result == -2)
9794 return NULL;
9795
Christian Heimes217cfd12007-12-02 14:31:20 +00009796 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797}
9798
9799static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009800unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 Py_UCS4 ch;
9803
9804 if (PyUnicode_READY(self) == -1)
9805 return NULL;
9806 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807 PyErr_SetString(PyExc_IndexError, "string index out of range");
9808 return NULL;
9809 }
9810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9812 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813}
9814
Guido van Rossumc2504932007-09-18 19:42:40 +00009815/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009816 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009817static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009818unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819{
Guido van Rossumc2504932007-09-18 19:42:40 +00009820 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009821 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 if (_PyUnicode_HASH(self) != -1)
9824 return _PyUnicode_HASH(self);
9825 if (PyUnicode_READY(self) == -1)
9826 return -1;
9827 len = PyUnicode_GET_LENGTH(self);
9828
9829 /* The hash function as a macro, gets expanded three times below. */
9830#define HASH(P) \
9831 x = (Py_uhash_t)*P << 7; \
9832 while (--len >= 0) \
9833 x = (1000003*x) ^ (Py_uhash_t)*P++;
9834
9835 switch (PyUnicode_KIND(self)) {
9836 case PyUnicode_1BYTE_KIND: {
9837 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9838 HASH(c);
9839 break;
9840 }
9841 case PyUnicode_2BYTE_KIND: {
9842 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9843 HASH(s);
9844 break;
9845 }
9846 default: {
9847 Py_UCS4 *l;
9848 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9849 "Impossible switch case in unicode_hash");
9850 l = PyUnicode_4BYTE_DATA(self);
9851 HASH(l);
9852 break;
9853 }
9854 }
9855 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9856
Guido van Rossumc2504932007-09-18 19:42:40 +00009857 if (x == -1)
9858 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009860 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009864PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009865 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009867Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868
9869static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009872 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009873 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009874 Py_ssize_t start;
9875 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009876
Jesus Ceaac451502011-04-20 17:09:23 +02009877 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9878 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 if (PyUnicode_READY(self) == -1)
9882 return NULL;
9883 if (PyUnicode_READY(substring) == -1)
9884 return NULL;
9885
9886 result = any_find_slice(
9887 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9888 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009889 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890
9891 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 if (result == -2)
9894 return NULL;
9895
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896 if (result < 0) {
9897 PyErr_SetString(PyExc_ValueError, "substring not found");
9898 return NULL;
9899 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009900
Christian Heimes217cfd12007-12-02 14:31:20 +00009901 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902}
9903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009904PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009905 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009907Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009908at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909
9910static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009911unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 Py_ssize_t i, length;
9914 int kind;
9915 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916 int cased;
9917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 if (PyUnicode_READY(self) == -1)
9919 return NULL;
9920 length = PyUnicode_GET_LENGTH(self);
9921 kind = PyUnicode_KIND(self);
9922 data = PyUnicode_DATA(self);
9923
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 if (length == 1)
9926 return PyBool_FromLong(
9927 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009929 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009931 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009932
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 for (i = 0; i < length; i++) {
9935 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009936
Benjamin Peterson29060642009-01-31 22:14:21 +00009937 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9938 return PyBool_FromLong(0);
9939 else if (!cased && Py_UNICODE_ISLOWER(ch))
9940 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009942 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943}
9944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009945PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009946 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009948Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009949at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950
9951static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009952unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 Py_ssize_t i, length;
9955 int kind;
9956 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009957 int cased;
9958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 if (PyUnicode_READY(self) == -1)
9960 return NULL;
9961 length = PyUnicode_GET_LENGTH(self);
9962 kind = PyUnicode_KIND(self);
9963 data = PyUnicode_DATA(self);
9964
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 if (length == 1)
9967 return PyBool_FromLong(
9968 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009970 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009972 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009973
Guido van Rossumd57fd912000-03-10 22:53:23 +00009974 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 for (i = 0; i < length; i++) {
9976 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009977
Benjamin Peterson29060642009-01-31 22:14:21 +00009978 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9979 return PyBool_FromLong(0);
9980 else if (!cased && Py_UNICODE_ISUPPER(ch))
9981 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009983 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984}
9985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009986PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009987 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009988\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009989Return True if S is a titlecased string and there is at least one\n\
9990character in S, i.e. upper- and titlecase characters may only\n\
9991follow uncased characters and lowercase characters only cased ones.\n\
9992Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993
9994static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009995unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 Py_ssize_t i, length;
9998 int kind;
9999 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010000 int cased, previous_is_cased;
10001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 if (PyUnicode_READY(self) == -1)
10003 return NULL;
10004 length = PyUnicode_GET_LENGTH(self);
10005 kind = PyUnicode_KIND(self);
10006 data = PyUnicode_DATA(self);
10007
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 if (length == 1) {
10010 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10011 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10012 (Py_UNICODE_ISUPPER(ch) != 0));
10013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010015 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010017 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010018
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019 cased = 0;
10020 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 for (i = 0; i < length; i++) {
10022 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010023
Benjamin Peterson29060642009-01-31 22:14:21 +000010024 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10025 if (previous_is_cased)
10026 return PyBool_FromLong(0);
10027 previous_is_cased = 1;
10028 cased = 1;
10029 }
10030 else if (Py_UNICODE_ISLOWER(ch)) {
10031 if (!previous_is_cased)
10032 return PyBool_FromLong(0);
10033 previous_is_cased = 1;
10034 cased = 1;
10035 }
10036 else
10037 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010039 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040}
10041
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010042PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010043 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010045Return True if all characters in S are whitespace\n\
10046and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010047
10048static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010049unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 Py_ssize_t i, length;
10052 int kind;
10053 void *data;
10054
10055 if (PyUnicode_READY(self) == -1)
10056 return NULL;
10057 length = PyUnicode_GET_LENGTH(self);
10058 kind = PyUnicode_KIND(self);
10059 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 if (length == 1)
10063 return PyBool_FromLong(
10064 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010066 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010068 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 for (i = 0; i < length; i++) {
10071 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010072 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010073 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010074 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010075 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010076}
10077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010078PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010079 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010080\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010081Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010082and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010083
10084static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010085unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010086{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 Py_ssize_t i, length;
10088 int kind;
10089 void *data;
10090
10091 if (PyUnicode_READY(self) == -1)
10092 return NULL;
10093 length = PyUnicode_GET_LENGTH(self);
10094 kind = PyUnicode_KIND(self);
10095 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010096
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010097 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 if (length == 1)
10099 return PyBool_FromLong(
10100 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010101
10102 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010104 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 for (i = 0; i < length; i++) {
10107 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010108 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010109 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010110 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010111}
10112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010113PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010114 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010115\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010116Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010117and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010118
10119static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010120unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010121{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 int kind;
10123 void *data;
10124 Py_ssize_t len, i;
10125
10126 if (PyUnicode_READY(self) == -1)
10127 return NULL;
10128
10129 kind = PyUnicode_KIND(self);
10130 data = PyUnicode_DATA(self);
10131 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010132
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010133 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 if (len == 1) {
10135 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10136 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10137 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010138
10139 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010141 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 for (i = 0; i < len; i++) {
10144 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010145 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010146 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010147 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010148 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010149}
10150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010151PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010152 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010154Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010155False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156
10157static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010158unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 Py_ssize_t i, length;
10161 int kind;
10162 void *data;
10163
10164 if (PyUnicode_READY(self) == -1)
10165 return NULL;
10166 length = PyUnicode_GET_LENGTH(self);
10167 kind = PyUnicode_KIND(self);
10168 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 if (length == 1)
10172 return PyBool_FromLong(
10173 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010175 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010177 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 for (i = 0; i < length; i++) {
10180 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010181 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010183 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184}
10185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010186PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010187 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010189Return True if all characters in S are digits\n\
10190and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010191
10192static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010193unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 Py_ssize_t i, length;
10196 int kind;
10197 void *data;
10198
10199 if (PyUnicode_READY(self) == -1)
10200 return NULL;
10201 length = PyUnicode_GET_LENGTH(self);
10202 kind = PyUnicode_KIND(self);
10203 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 if (length == 1) {
10207 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10208 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010211 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010213 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 for (i = 0; i < length; i++) {
10216 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010217 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010219 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220}
10221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010222PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010223 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010225Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010226False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227
10228static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010229unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 Py_ssize_t i, length;
10232 int kind;
10233 void *data;
10234
10235 if (PyUnicode_READY(self) == -1)
10236 return NULL;
10237 length = PyUnicode_GET_LENGTH(self);
10238 kind = PyUnicode_KIND(self);
10239 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 if (length == 1)
10243 return PyBool_FromLong(
10244 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010246 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010248 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 for (i = 0; i < length; i++) {
10251 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010252 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010254 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255}
10256
Martin v. Löwis47383402007-08-15 07:32:56 +000010257int
10258PyUnicode_IsIdentifier(PyObject *self)
10259{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 int kind;
10261 void *data;
10262 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010263 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 if (PyUnicode_READY(self) == -1) {
10266 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010267 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 }
10269
10270 /* Special case for empty strings */
10271 if (PyUnicode_GET_LENGTH(self) == 0)
10272 return 0;
10273 kind = PyUnicode_KIND(self);
10274 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010275
10276 /* PEP 3131 says that the first character must be in
10277 XID_Start and subsequent characters in XID_Continue,
10278 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010279 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010280 letters, digits, underscore). However, given the current
10281 definition of XID_Start and XID_Continue, it is sufficient
10282 to check just for these, except that _ must be allowed
10283 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010285 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010286 return 0;
10287
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010288 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010290 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010291 return 1;
10292}
10293
10294PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010295 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010296\n\
10297Return True if S is a valid identifier according\n\
10298to the language definition.");
10299
10300static PyObject*
10301unicode_isidentifier(PyObject *self)
10302{
10303 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10304}
10305
Georg Brandl559e5d72008-06-11 18:37:52 +000010306PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010307 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010308\n\
10309Return True if all characters in S are considered\n\
10310printable in repr() or S is empty, False otherwise.");
10311
10312static PyObject*
10313unicode_isprintable(PyObject *self)
10314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 Py_ssize_t i, length;
10316 int kind;
10317 void *data;
10318
10319 if (PyUnicode_READY(self) == -1)
10320 return NULL;
10321 length = PyUnicode_GET_LENGTH(self);
10322 kind = PyUnicode_KIND(self);
10323 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010324
10325 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 if (length == 1)
10327 return PyBool_FromLong(
10328 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 for (i = 0; i < length; i++) {
10331 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010332 Py_RETURN_FALSE;
10333 }
10334 }
10335 Py_RETURN_TRUE;
10336}
10337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010338PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010339 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340\n\
10341Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010342iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343
10344static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010345unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010347 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348}
10349
Martin v. Löwis18e16552006-02-15 17:27:45 +000010350static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351unicode_length(PyUnicodeObject *self)
10352{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 if (PyUnicode_READY(self) == -1)
10354 return -1;
10355 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356}
10357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010358PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010359 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010361Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010362done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363
10364static PyObject *
10365unicode_ljust(PyUnicodeObject *self, PyObject *args)
10366{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010367 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 Py_UCS4 fillchar = ' ';
10369
10370 if (PyUnicode_READY(self) == -1)
10371 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010372
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010373 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374 return NULL;
10375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377 Py_INCREF(self);
10378 return (PyObject*) self;
10379 }
10380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382}
10383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010384PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010385 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010387Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388
10389static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010390unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392 return fixup(self, fixlower);
10393}
10394
/* Which end(s) of the string a strip operation works on.  The values
   double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the constants above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip type: the format string minus its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
10403
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010404/* externally visible for str.strip(unicode) */
10405PyObject *
10406_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10407{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 void *data;
10409 int kind;
10410 Py_ssize_t i, j, len;
10411 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10414 return NULL;
10415
10416 kind = PyUnicode_KIND(self);
10417 data = PyUnicode_DATA(self);
10418 len = PyUnicode_GET_LENGTH(self);
10419 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10420 PyUnicode_DATA(sepobj),
10421 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010422
Benjamin Peterson14339b62009-01-31 16:36:08 +000010423 i = 0;
10424 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 while (i < len &&
10426 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010427 i++;
10428 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010429 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010430
Benjamin Peterson14339b62009-01-31 16:36:08 +000010431 j = len;
10432 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010433 do {
10434 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 } while (j >= i &&
10436 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010437 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010438 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010439
Victor Stinner12bab6d2011-10-01 01:53:49 +020010440 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441}
10442
10443PyObject*
10444PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10445{
10446 unsigned char *data;
10447 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010448 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449
Victor Stinner12bab6d2011-10-01 01:53:49 +020010450 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010452 if (PyUnicode_CheckExact(self)) {
10453 Py_INCREF(self);
10454 return self;
10455 }
10456 else
10457 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 }
10459
Victor Stinner12bab6d2011-10-01 01:53:49 +020010460 length = end - start;
10461 if (length == 1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 return unicode_getitem((PyUnicodeObject*)self, start);
10463
Victor Stinner12bab6d2011-10-01 01:53:49 +020010464 if (start < 0 || end < 0 || end > PyUnicode_GET_LENGTH(self)) {
10465 PyErr_SetString(PyExc_IndexError, "string index out of range");
10466 return NULL;
10467 }
10468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 if (PyUnicode_READY(self) == -1)
10470 return NULL;
10471 kind = PyUnicode_KIND(self);
10472 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010473 return PyUnicode_FromKindAndData(kind,
10474 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010475 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477
10478static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010479do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 int kind;
10482 void *data;
10483 Py_ssize_t len, i, j;
10484
10485 if (PyUnicode_READY(self) == -1)
10486 return NULL;
10487
10488 kind = PyUnicode_KIND(self);
10489 data = PyUnicode_DATA(self);
10490 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010491
Benjamin Peterson14339b62009-01-31 16:36:08 +000010492 i = 0;
10493 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010495 i++;
10496 }
10497 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010498
Benjamin Peterson14339b62009-01-31 16:36:08 +000010499 j = len;
10500 if (striptype != LEFTSTRIP) {
10501 do {
10502 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010504 j++;
10505 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010506
Victor Stinner12bab6d2011-10-01 01:53:49 +020010507 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508}
10509
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010510
10511static PyObject *
10512do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10513{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010514 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010515
Benjamin Peterson14339b62009-01-31 16:36:08 +000010516 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10517 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010518
Benjamin Peterson14339b62009-01-31 16:36:08 +000010519 if (sep != NULL && sep != Py_None) {
10520 if (PyUnicode_Check(sep))
10521 return _PyUnicode_XStrip(self, striptype, sep);
10522 else {
10523 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010524 "%s arg must be None or str",
10525 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010526 return NULL;
10527 }
10528 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010529
Benjamin Peterson14339b62009-01-31 16:36:08 +000010530 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010531}
10532
10533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010534PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010535 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010536\n\
10537Return a copy of the string S with leading and trailing\n\
10538whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010539If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010540
10541static PyObject *
10542unicode_strip(PyUnicodeObject *self, PyObject *args)
10543{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010544 if (PyTuple_GET_SIZE(args) == 0)
10545 return do_strip(self, BOTHSTRIP); /* Common case */
10546 else
10547 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010548}
10549
10550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010551PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010552 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010553\n\
10554Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010555If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010556
10557static PyObject *
10558unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10559{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010560 if (PyTuple_GET_SIZE(args) == 0)
10561 return do_strip(self, LEFTSTRIP); /* Common case */
10562 else
10563 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010564}
10565
10566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010567PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010568 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010569\n\
10570Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010571If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010572
10573static PyObject *
10574unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10575{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010576 if (PyTuple_GET_SIZE(args) == 0)
10577 return do_strip(self, RIGHTSTRIP); /* Common case */
10578 else
10579 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010580}
10581
10582
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010584unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585{
10586 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588
Georg Brandl222de0f2009-04-12 12:01:50 +000010589 if (len < 1) {
10590 Py_INCREF(unicode_empty);
10591 return (PyObject *)unicode_empty;
10592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593
Tim Peters7a29bd52001-09-12 03:03:31 +000010594 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595 /* no repeat, return original string */
10596 Py_INCREF(str);
10597 return (PyObject*) str;
10598 }
Tim Peters8f422462000-09-09 06:13:41 +000010599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 if (PyUnicode_READY(str) == -1)
10601 return NULL;
10602
Victor Stinner67ca64c2011-10-01 02:47:29 +020010603 if (len > PY_SSIZE_T_MAX / PyUnicode_GET_LENGTH(str)) {
10604 PyErr_SetString(PyExc_OverflowError,
10605 "repeated string is too long");
10606 return NULL;
10607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611 if (!u)
10612 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010613 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 if (PyUnicode_GET_LENGTH(str) == 1) {
10616 const int kind = PyUnicode_KIND(str);
10617 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10618 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010619 if (kind == PyUnicode_1BYTE_KIND)
10620 memset(to, (unsigned char)fill_char, len);
10621 else {
10622 for (n = 0; n < len; ++n)
10623 PyUnicode_WRITE(kind, to, n, fill_char);
10624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 }
10626 else {
10627 /* number of characters copied this far */
10628 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10629 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10630 char *to = (char *) PyUnicode_DATA(u);
10631 Py_MEMCPY(to, PyUnicode_DATA(str),
10632 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010633 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 n = (done <= nchars-done) ? done : nchars-done;
10635 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010636 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638 }
10639
10640 return (PyObject*) u;
10641}
10642
Alexander Belopolsky40018472011-02-26 01:02:56 +000010643PyObject *
10644PyUnicode_Replace(PyObject *obj,
10645 PyObject *subobj,
10646 PyObject *replobj,
10647 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648{
10649 PyObject *self;
10650 PyObject *str1;
10651 PyObject *str2;
10652 PyObject *result;
10653
10654 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010655 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010658 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010659 Py_DECREF(self);
10660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010661 }
10662 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010663 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010664 Py_DECREF(self);
10665 Py_DECREF(str1);
10666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010667 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669 Py_DECREF(self);
10670 Py_DECREF(str1);
10671 Py_DECREF(str2);
10672 return result;
10673}
10674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010675PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010676 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677\n\
10678Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010679old replaced by new. If the optional argument count is\n\
10680given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681
10682static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010684{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 PyObject *str1;
10686 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010687 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688 PyObject *result;
10689
Martin v. Löwis18e16552006-02-15 17:27:45 +000010690 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010693 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 str1 = PyUnicode_FromObject(str1);
10695 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10696 return NULL;
10697 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020010698 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010699 Py_DECREF(str1);
10700 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702
10703 result = replace(self, str1, str2, maxcount);
10704
10705 Py_DECREF(str1);
10706 Py_DECREF(str2);
10707 return result;
10708}
10709
Alexander Belopolsky40018472011-02-26 01:02:56 +000010710static PyObject *
10711unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010713 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 Py_ssize_t isize;
10715 Py_ssize_t osize, squote, dquote, i, o;
10716 Py_UCS4 max, quote;
10717 int ikind, okind;
10718 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010721 return NULL;
10722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 isize = PyUnicode_GET_LENGTH(unicode);
10724 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 /* Compute length of output, quote characters, and
10727 maximum character */
10728 osize = 2; /* quotes */
10729 max = 127;
10730 squote = dquote = 0;
10731 ikind = PyUnicode_KIND(unicode);
10732 for (i = 0; i < isize; i++) {
10733 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10734 switch (ch) {
10735 case '\'': squote++; osize++; break;
10736 case '"': dquote++; osize++; break;
10737 case '\\': case '\t': case '\r': case '\n':
10738 osize += 2; break;
10739 default:
10740 /* Fast-path ASCII */
10741 if (ch < ' ' || ch == 0x7f)
10742 osize += 4; /* \xHH */
10743 else if (ch < 0x7f)
10744 osize++;
10745 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10746 osize++;
10747 max = ch > max ? ch : max;
10748 }
10749 else if (ch < 0x100)
10750 osize += 4; /* \xHH */
10751 else if (ch < 0x10000)
10752 osize += 6; /* \uHHHH */
10753 else
10754 osize += 10; /* \uHHHHHHHH */
10755 }
10756 }
10757
10758 quote = '\'';
10759 if (squote) {
10760 if (dquote)
10761 /* Both squote and dquote present. Use squote,
10762 and escape them */
10763 osize += squote;
10764 else
10765 quote = '"';
10766 }
10767
10768 repr = PyUnicode_New(osize, max);
10769 if (repr == NULL)
10770 return NULL;
10771 okind = PyUnicode_KIND(repr);
10772 odata = PyUnicode_DATA(repr);
10773
10774 PyUnicode_WRITE(okind, odata, 0, quote);
10775 PyUnicode_WRITE(okind, odata, osize-1, quote);
10776
10777 for (i = 0, o = 1; i < isize; i++) {
10778 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010779
10780 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 if ((ch == quote) || (ch == '\\')) {
10782 PyUnicode_WRITE(okind, odata, o++, '\\');
10783 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010784 continue;
10785 }
10786
Benjamin Peterson29060642009-01-31 22:14:21 +000010787 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010788 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 PyUnicode_WRITE(okind, odata, o++, '\\');
10790 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010791 }
10792 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 PyUnicode_WRITE(okind, odata, o++, '\\');
10794 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010795 }
10796 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 PyUnicode_WRITE(okind, odata, o++, '\\');
10798 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010799 }
10800
10801 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010802 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 PyUnicode_WRITE(okind, odata, o++, '\\');
10804 PyUnicode_WRITE(okind, odata, o++, 'x');
10805 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10806 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010807 }
10808
Georg Brandl559e5d72008-06-11 18:37:52 +000010809 /* Copy ASCII characters as-is */
10810 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010812 }
10813
Benjamin Peterson29060642009-01-31 22:14:21 +000010814 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010815 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010816 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010817 (categories Z* and C* except ASCII space)
10818 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010820 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821 if (ch <= 0xff) {
10822 PyUnicode_WRITE(okind, odata, o++, '\\');
10823 PyUnicode_WRITE(okind, odata, o++, 'x');
10824 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10825 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010826 }
10827 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 else if (ch >= 0x10000) {
10829 PyUnicode_WRITE(okind, odata, o++, '\\');
10830 PyUnicode_WRITE(okind, odata, o++, 'U');
10831 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10832 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10833 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10834 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10835 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10836 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10837 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10838 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010839 }
10840 /* Map 16-bit characters to '\uxxxx' */
10841 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 PyUnicode_WRITE(okind, odata, o++, '\\');
10843 PyUnicode_WRITE(okind, odata, o++, 'u');
10844 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10845 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10846 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10847 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010848 }
10849 }
10850 /* Copy characters as-is */
10851 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010852 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010853 }
10854 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010857 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858}
10859
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010860PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010861 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862\n\
10863Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010864such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865arguments start and end are interpreted as in slice notation.\n\
10866\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010867Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868
10869static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871{
Jesus Ceaac451502011-04-20 17:09:23 +020010872 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010873 Py_ssize_t start;
10874 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010875 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876
Jesus Ceaac451502011-04-20 17:09:23 +020010877 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10878 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 if (PyUnicode_READY(self) == -1)
10882 return NULL;
10883 if (PyUnicode_READY(substring) == -1)
10884 return NULL;
10885
10886 result = any_find_slice(
10887 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10888 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010889 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
10891 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 if (result == -2)
10894 return NULL;
10895
Christian Heimes217cfd12007-12-02 14:31:20 +000010896 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897}
10898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010899PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010900 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010902Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903
10904static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906{
Jesus Ceaac451502011-04-20 17:09:23 +020010907 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010908 Py_ssize_t start;
10909 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010910 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911
Jesus Ceaac451502011-04-20 17:09:23 +020010912 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10913 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 if (PyUnicode_READY(self) == -1)
10917 return NULL;
10918 if (PyUnicode_READY(substring) == -1)
10919 return NULL;
10920
10921 result = any_find_slice(
10922 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10923 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010924 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925
10926 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 if (result == -2)
10929 return NULL;
10930
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931 if (result < 0) {
10932 PyErr_SetString(PyExc_ValueError, "substring not found");
10933 return NULL;
10934 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935
Christian Heimes217cfd12007-12-02 14:31:20 +000010936 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937}
10938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010939PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010940 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010942Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010943done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944
10945static PyObject *
10946unicode_rjust(PyUnicodeObject *self, PyObject *args)
10947{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010948 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 Py_UCS4 fillchar = ' ';
10950
Victor Stinnere9a29352011-10-01 02:14:59 +020010951 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010953
Victor Stinnere9a29352011-10-01 02:14:59 +020010954 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955 return NULL;
10956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958 Py_INCREF(self);
10959 return (PyObject*) self;
10960 }
10961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963}
10964
Alexander Belopolsky40018472011-02-26 01:02:56 +000010965PyObject *
10966PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967{
10968 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010969
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970 s = PyUnicode_FromObject(s);
10971 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010972 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010973 if (sep != NULL) {
10974 sep = PyUnicode_FromObject(sep);
10975 if (sep == NULL) {
10976 Py_DECREF(s);
10977 return NULL;
10978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979 }
10980
10981 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10982
10983 Py_DECREF(s);
10984 Py_XDECREF(sep);
10985 return result;
10986}
10987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010988PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010989 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990\n\
10991Return a list of the words in S, using sep as the\n\
10992delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000010993splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000010994whitespace string is a separator and empty strings are\n\
10995removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996
10997static PyObject*
10998unicode_split(PyUnicodeObject *self, PyObject *args)
10999{
11000 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011001 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002
Martin v. Löwis18e16552006-02-15 17:27:45 +000011003 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 return NULL;
11005
11006 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011007 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011009 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012}
11013
Thomas Wouters477c8d52006-05-27 19:21:47 +000011014PyObject *
11015PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11016{
11017 PyObject* str_obj;
11018 PyObject* sep_obj;
11019 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 int kind1, kind2, kind;
11021 void *buf1 = NULL, *buf2 = NULL;
11022 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011023
11024 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011025 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011026 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011027 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011029 Py_DECREF(str_obj);
11030 return NULL;
11031 }
11032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 kind1 = PyUnicode_KIND(str_in);
11034 kind2 = PyUnicode_KIND(sep_obj);
11035 kind = kind1 > kind2 ? kind1 : kind2;
11036 buf1 = PyUnicode_DATA(str_in);
11037 if (kind1 != kind)
11038 buf1 = _PyUnicode_AsKind(str_in, kind);
11039 if (!buf1)
11040 goto onError;
11041 buf2 = PyUnicode_DATA(sep_obj);
11042 if (kind2 != kind)
11043 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11044 if (!buf2)
11045 goto onError;
11046 len1 = PyUnicode_GET_LENGTH(str_obj);
11047 len2 = PyUnicode_GET_LENGTH(sep_obj);
11048
11049 switch(PyUnicode_KIND(str_in)) {
11050 case PyUnicode_1BYTE_KIND:
11051 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11052 break;
11053 case PyUnicode_2BYTE_KIND:
11054 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11055 break;
11056 case PyUnicode_4BYTE_KIND:
11057 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11058 break;
11059 default:
11060 assert(0);
11061 out = 0;
11062 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011063
11064 Py_DECREF(sep_obj);
11065 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 if (kind1 != kind)
11067 PyMem_Free(buf1);
11068 if (kind2 != kind)
11069 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011070
11071 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 onError:
11073 Py_DECREF(sep_obj);
11074 Py_DECREF(str_obj);
11075 if (kind1 != kind && buf1)
11076 PyMem_Free(buf1);
11077 if (kind2 != kind && buf2)
11078 PyMem_Free(buf2);
11079 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011080}
11081
11082
11083PyObject *
11084PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11085{
11086 PyObject* str_obj;
11087 PyObject* sep_obj;
11088 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089 int kind1, kind2, kind;
11090 void *buf1 = NULL, *buf2 = NULL;
11091 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011092
11093 str_obj = PyUnicode_FromObject(str_in);
11094 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011095 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011096 sep_obj = PyUnicode_FromObject(sep_in);
11097 if (!sep_obj) {
11098 Py_DECREF(str_obj);
11099 return NULL;
11100 }
11101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102 kind1 = PyUnicode_KIND(str_in);
11103 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011104 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 buf1 = PyUnicode_DATA(str_in);
11106 if (kind1 != kind)
11107 buf1 = _PyUnicode_AsKind(str_in, kind);
11108 if (!buf1)
11109 goto onError;
11110 buf2 = PyUnicode_DATA(sep_obj);
11111 if (kind2 != kind)
11112 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11113 if (!buf2)
11114 goto onError;
11115 len1 = PyUnicode_GET_LENGTH(str_obj);
11116 len2 = PyUnicode_GET_LENGTH(sep_obj);
11117
11118 switch(PyUnicode_KIND(str_in)) {
11119 case PyUnicode_1BYTE_KIND:
11120 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11121 break;
11122 case PyUnicode_2BYTE_KIND:
11123 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11124 break;
11125 case PyUnicode_4BYTE_KIND:
11126 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11127 break;
11128 default:
11129 assert(0);
11130 out = 0;
11131 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011132
11133 Py_DECREF(sep_obj);
11134 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011135 if (kind1 != kind)
11136 PyMem_Free(buf1);
11137 if (kind2 != kind)
11138 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011139
11140 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 onError:
11142 Py_DECREF(sep_obj);
11143 Py_DECREF(str_obj);
11144 if (kind1 != kind && buf1)
11145 PyMem_Free(buf1);
11146 if (kind2 != kind && buf2)
11147 PyMem_Free(buf2);
11148 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011149}
11150
11151PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011152 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011153\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011154Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011155the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011156found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011157
11158static PyObject*
11159unicode_partition(PyUnicodeObject *self, PyObject *separator)
11160{
11161 return PyUnicode_Partition((PyObject *)self, separator);
11162}
11163
11164PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011165 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011167Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011168the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011169separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011170
11171static PyObject*
11172unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11173{
11174 return PyUnicode_RPartition((PyObject *)self, separator);
11175}
11176
Alexander Belopolsky40018472011-02-26 01:02:56 +000011177PyObject *
11178PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011179{
11180 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011181
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011182 s = PyUnicode_FromObject(s);
11183 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011184 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 if (sep != NULL) {
11186 sep = PyUnicode_FromObject(sep);
11187 if (sep == NULL) {
11188 Py_DECREF(s);
11189 return NULL;
11190 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011191 }
11192
11193 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11194
11195 Py_DECREF(s);
11196 Py_XDECREF(sep);
11197 return result;
11198}
11199
11200PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011201 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011202\n\
11203Return a list of the words in S, using sep as the\n\
11204delimiter string, starting at the end of the string and\n\
11205working to the front. If maxsplit is given, at most maxsplit\n\
11206splits are done. If sep is not specified, any whitespace string\n\
11207is a separator.");
11208
11209static PyObject*
11210unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11211{
11212 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011213 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011214
Martin v. Löwis18e16552006-02-15 17:27:45 +000011215 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011216 return NULL;
11217
11218 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011219 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011220 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011221 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011222 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011223 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011224}
11225
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011226PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011227 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228\n\
11229Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011230Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011231is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232
11233static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011234unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011236 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011237 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011239 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11240 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241 return NULL;
11242
Guido van Rossum86662912000-04-11 15:38:46 +000011243 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244}
11245
11246static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011247PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248{
Walter Dörwald346737f2007-05-31 10:44:43 +000011249 if (PyUnicode_CheckExact(self)) {
11250 Py_INCREF(self);
11251 return self;
11252 } else
11253 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011254 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255}
11256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011257PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259\n\
11260Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011261and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262
11263static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011264unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 return fixup(self, fixswapcase);
11267}
11268
Georg Brandlceee0772007-11-27 23:48:05 +000011269PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011271\n\
11272Return a translation table usable for str.translate().\n\
11273If there is only one argument, it must be a dictionary mapping Unicode\n\
11274ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011275Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011276If there are two arguments, they must be strings of equal length, and\n\
11277in the resulting dictionary, each character in x will be mapped to the\n\
11278character at the same position in y. If there is a third argument, it\n\
11279must be a string, whose characters will be mapped to None in the result.");
11280
11281static PyObject*
11282unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11283{
11284 PyObject *x, *y = NULL, *z = NULL;
11285 PyObject *new = NULL, *key, *value;
11286 Py_ssize_t i = 0;
11287 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011288
Georg Brandlceee0772007-11-27 23:48:05 +000011289 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11290 return NULL;
11291 new = PyDict_New();
11292 if (!new)
11293 return NULL;
11294 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 int x_kind, y_kind, z_kind;
11296 void *x_data, *y_data, *z_data;
11297
Georg Brandlceee0772007-11-27 23:48:05 +000011298 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011299 if (!PyUnicode_Check(x)) {
11300 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11301 "be a string if there is a second argument");
11302 goto err;
11303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011305 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11306 "arguments must have equal length");
11307 goto err;
11308 }
11309 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011310 x_kind = PyUnicode_KIND(x);
11311 y_kind = PyUnicode_KIND(y);
11312 x_data = PyUnicode_DATA(x);
11313 y_data = PyUnicode_DATA(y);
11314 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11315 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11316 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011317 if (!key || !value)
11318 goto err;
11319 res = PyDict_SetItem(new, key, value);
11320 Py_DECREF(key);
11321 Py_DECREF(value);
11322 if (res < 0)
11323 goto err;
11324 }
11325 /* create entries for deleting chars in z */
11326 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 z_kind = PyUnicode_KIND(z);
11328 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011329 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011331 if (!key)
11332 goto err;
11333 res = PyDict_SetItem(new, key, Py_None);
11334 Py_DECREF(key);
11335 if (res < 0)
11336 goto err;
11337 }
11338 }
11339 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 int kind;
11341 void *data;
11342
Georg Brandlceee0772007-11-27 23:48:05 +000011343 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011344 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011345 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11346 "to maketrans it must be a dict");
11347 goto err;
11348 }
11349 /* copy entries into the new dict, converting string keys to int keys */
11350 while (PyDict_Next(x, &i, &key, &value)) {
11351 if (PyUnicode_Check(key)) {
11352 /* convert string keys to integer keys */
11353 PyObject *newkey;
11354 if (PyUnicode_GET_SIZE(key) != 1) {
11355 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11356 "table must be of length 1");
11357 goto err;
11358 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 kind = PyUnicode_KIND(key);
11360 data = PyUnicode_DATA(key);
11361 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011362 if (!newkey)
11363 goto err;
11364 res = PyDict_SetItem(new, newkey, value);
11365 Py_DECREF(newkey);
11366 if (res < 0)
11367 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011368 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011369 /* just keep integer keys */
11370 if (PyDict_SetItem(new, key, value) < 0)
11371 goto err;
11372 } else {
11373 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11374 "be strings or integers");
11375 goto err;
11376 }
11377 }
11378 }
11379 return new;
11380 err:
11381 Py_DECREF(new);
11382 return NULL;
11383}
11384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011385PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011386 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387\n\
11388Return a copy of the string S, where all characters have been mapped\n\
11389through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011390Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011391Unmapped characters are left untouched. Characters mapped to None\n\
11392are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393
11394static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011395unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398}
11399
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011400PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011401 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011403Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404
11405static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011406unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408 return fixup(self, fixupper);
11409}
11410
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011411PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011412 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011414Pad a numeric string S with zeros on the left, to fill a field\n\
11415of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416
11417static PyObject *
11418unicode_zfill(PyUnicodeObject *self, PyObject *args)
11419{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011420 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011422 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423 int kind;
11424 void *data;
11425 Py_UCS4 chr;
11426
11427 if (PyUnicode_READY(self) == -1)
11428 return NULL;
11429
Martin v. Löwis18e16552006-02-15 17:27:45 +000011430 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431 return NULL;
11432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011434 if (PyUnicode_CheckExact(self)) {
11435 Py_INCREF(self);
11436 return (PyObject*) self;
11437 }
11438 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011439 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440 }
11441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
11444 u = pad(self, fill, 0, '0');
11445
Walter Dörwald068325e2002-04-15 13:36:47 +000011446 if (u == NULL)
11447 return NULL;
11448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 kind = PyUnicode_KIND(u);
11450 data = PyUnicode_DATA(u);
11451 chr = PyUnicode_READ(kind, data, fill);
11452
11453 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 PyUnicode_WRITE(kind, data, 0, chr);
11456 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457 }
11458
11459 return (PyObject*) u;
11460}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
11462#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011463static PyObject *
11464unicode__decimal2ascii(PyObject *self)
11465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011467}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468#endif
11469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011470PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011473Return True if S starts with the specified prefix, False otherwise.\n\
11474With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011475With optional end, stop comparing S at that position.\n\
11476prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477
11478static PyObject *
11479unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011482 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011484 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011485 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011486 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
Jesus Ceaac451502011-04-20 17:09:23 +020011488 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011490 if (PyTuple_Check(subobj)) {
11491 Py_ssize_t i;
11492 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11493 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011495 if (substring == NULL)
11496 return NULL;
11497 result = tailmatch(self, substring, start, end, -1);
11498 Py_DECREF(substring);
11499 if (result) {
11500 Py_RETURN_TRUE;
11501 }
11502 }
11503 /* nothing matched */
11504 Py_RETURN_FALSE;
11505 }
11506 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011507 if (substring == NULL) {
11508 if (PyErr_ExceptionMatches(PyExc_TypeError))
11509 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11510 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011512 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011513 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011515 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516}
11517
11518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011519PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011520 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011522Return True if S ends with the specified suffix, False otherwise.\n\
11523With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011524With optional end, stop comparing S at that position.\n\
11525suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526
11527static PyObject *
11528unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011529 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011531 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011533 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011534 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011535 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
Jesus Ceaac451502011-04-20 17:09:23 +020011537 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011538 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011539 if (PyTuple_Check(subobj)) {
11540 Py_ssize_t i;
11541 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11542 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011543 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011544 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011546 result = tailmatch(self, substring, start, end, +1);
11547 Py_DECREF(substring);
11548 if (result) {
11549 Py_RETURN_TRUE;
11550 }
11551 }
11552 Py_RETURN_FALSE;
11553 }
11554 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011555 if (substring == NULL) {
11556 if (PyErr_ExceptionMatches(PyExc_TypeError))
11557 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11558 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011560 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011561 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011563 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564}
11565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011567
11568PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011569 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011570\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011571Return a formatted version of S, using substitutions from args and kwargs.\n\
11572The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011573
Eric Smith27bbca62010-11-04 17:06:58 +000011574PyDoc_STRVAR(format_map__doc__,
11575 "S.format_map(mapping) -> str\n\
11576\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011577Return a formatted version of S, using substitutions from mapping.\n\
11578The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011579
Eric Smith4a7d76d2008-05-30 18:10:19 +000011580static PyObject *
11581unicode__format__(PyObject* self, PyObject* args)
11582{
11583 PyObject *format_spec;
11584
11585 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11586 return NULL;
11587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011588 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11589 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011590}
11591
Eric Smith8c663262007-08-25 02:26:07 +000011592PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011594\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011595Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011596
11597static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011598unicode__sizeof__(PyUnicodeObject *v)
11599{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 Py_ssize_t size;
11601
11602 /* If it's a compact object, account for base structure +
11603 character data. */
11604 if (PyUnicode_IS_COMPACT_ASCII(v))
11605 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11606 else if (PyUnicode_IS_COMPACT(v))
11607 size = sizeof(PyCompactUnicodeObject) +
11608 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11609 else {
11610 /* If it is a two-block object, account for base object, and
11611 for character block if present. */
11612 size = sizeof(PyUnicodeObject);
11613 if (v->data.any)
11614 size += (PyUnicode_GET_LENGTH(v) + 1) *
11615 PyUnicode_CHARACTER_SIZE(v);
11616 }
11617 /* If the wstr pointer is present, account for it unless it is shared
11618 with the data pointer. Since PyUnicode_DATA will crash if the object
11619 is not ready, check whether it's either not ready (in which case the
11620 data is entirely in wstr) or if the data is not shared. */
11621 if (_PyUnicode_WSTR(v) &&
11622 (!PyUnicode_IS_READY(v) ||
11623 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11624 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11625 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11626 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11627
11628 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011629}
11630
11631PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011632 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011633
11634static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011635unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011636{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011637 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 if (!copy)
11639 return NULL;
11640 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011641}
11642
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643static PyMethodDef unicode_methods[] = {
11644
11645 /* Order is according to common usage: often used methods should
11646 appear first, since lookup is done sequentially. */
11647
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011648 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011649 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11650 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011651 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011652 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11653 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11654 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11655 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11656 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11657 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11658 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011659 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011660 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11661 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11662 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011663 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011664 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11665 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11666 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011667 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011668 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011669 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011670 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011671 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11672 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11673 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11674 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11675 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11676 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11677 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11678 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11679 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11680 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11681 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11682 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11683 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11684 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011685 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011686 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011687 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011688 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011689 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011690 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011691 {"maketrans", (PyCFunction) unicode_maketrans,
11692 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011693 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011694#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011695 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696#endif
11697
11698#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011699 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011700 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701#endif
11702
Benjamin Peterson14339b62009-01-31 16:36:08 +000011703 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704 {NULL, NULL}
11705};
11706
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011707static PyObject *
11708unicode_mod(PyObject *v, PyObject *w)
11709{
Brian Curtindfc80e32011-08-10 20:28:54 -050011710 if (!PyUnicode_Check(v))
11711 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011712 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011713}
11714
11715static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011716 0, /*nb_add*/
11717 0, /*nb_subtract*/
11718 0, /*nb_multiply*/
11719 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011720};
11721
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011723 (lenfunc) unicode_length, /* sq_length */
11724 PyUnicode_Concat, /* sq_concat */
11725 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11726 (ssizeargfunc) unicode_getitem, /* sq_item */
11727 0, /* sq_slice */
11728 0, /* sq_ass_item */
11729 0, /* sq_ass_slice */
11730 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731};
11732
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011733static PyObject*
11734unicode_subscript(PyUnicodeObject* self, PyObject* item)
11735{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736 if (PyUnicode_READY(self) == -1)
11737 return NULL;
11738
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011739 if (PyIndex_Check(item)) {
11740 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011741 if (i == -1 && PyErr_Occurred())
11742 return NULL;
11743 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011745 return unicode_getitem(self, i);
11746 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011747 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011749 Py_UNICODE* result_buf;
11750 PyObject* result;
11751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011754 return NULL;
11755 }
11756
11757 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 return PyUnicode_New(0, 0);
11759 } else if (start == 0 && step == 1 &&
11760 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011761 PyUnicode_CheckExact(self)) {
11762 Py_INCREF(self);
11763 return (PyObject *)self;
11764 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011765 return PyUnicode_Substring((PyObject*)self,
11766 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011767 } else {
11768 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011769 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11770 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011771
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 if (result_buf == NULL)
11773 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011774
11775 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11776 result_buf[i] = source_buf[cur];
11777 }
Tim Petersced69f82003-09-16 20:30:58 +000011778
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011779 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011780 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011781 return result;
11782 }
11783 } else {
11784 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11785 return NULL;
11786 }
11787}
11788
11789static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011790 (lenfunc)unicode_length, /* mp_length */
11791 (binaryfunc)unicode_subscript, /* mp_subscript */
11792 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011793};
11794
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796/* Helpers for PyUnicode_Format() */
11797
11798static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011799getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011801 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011803 (*p_argidx)++;
11804 if (arglen < 0)
11805 return args;
11806 else
11807 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808 }
11809 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811 return NULL;
11812}
11813
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011814/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011816static PyObject *
11817formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011819 char *p;
11820 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011822
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823 x = PyFloat_AsDouble(v);
11824 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011825 return NULL;
11826
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011828 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011829
Eric Smith0923d1d2009-04-16 20:16:10 +000011830 p = PyOS_double_to_string(x, type, prec,
11831 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011832 if (p == NULL)
11833 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011834 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011835 PyMem_Free(p);
11836 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837}
11838
Tim Peters38fd5b62000-09-21 05:43:11 +000011839static PyObject*
11840formatlong(PyObject *val, int flags, int prec, int type)
11841{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011842 char *buf;
11843 int len;
11844 PyObject *str; /* temporary string object. */
11845 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011846
Benjamin Peterson14339b62009-01-31 16:36:08 +000011847 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11848 if (!str)
11849 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011851 Py_DECREF(str);
11852 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011853}
11854
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011857 size_t buflen,
11858 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011860 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011861 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 if (PyUnicode_GET_LENGTH(v) == 1) {
11863 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 buf[1] = '\0';
11865 return 1;
11866 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011867 goto onError;
11868 }
11869 else {
11870 /* Integer input truncated to a character */
11871 long x;
11872 x = PyLong_AsLong(v);
11873 if (x == -1 && PyErr_Occurred())
11874 goto onError;
11875
11876 if (x < 0 || x > 0x10ffff) {
11877 PyErr_SetString(PyExc_OverflowError,
11878 "%c arg not in range(0x110000)");
11879 return -1;
11880 }
11881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011883 buf[1] = '\0';
11884 return 1;
11885 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011886
Benjamin Peterson29060642009-01-31 22:14:21 +000011887 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011888 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011889 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011890 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891}
11892
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011897
Alexander Belopolsky40018472011-02-26 01:02:56 +000011898PyObject *
11899PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 void *fmt;
11902 int fmtkind;
11903 PyObject *result;
11904 Py_UCS4 *res, *res0;
11905 Py_UCS4 max;
11906 int kind;
11907 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011911
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 PyErr_BadInternalCall();
11914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11917 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 fmt = PyUnicode_DATA(uformat);
11920 fmtkind = PyUnicode_KIND(uformat);
11921 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11922 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923
11924 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11926 if (res0 == NULL) {
11927 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011928 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930
11931 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011932 arglen = PyTuple_Size(args);
11933 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 }
11935 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 arglen = -1;
11937 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011939 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011940 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942
11943 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 if (--rescnt < 0) {
11946 rescnt = fmtcnt + 100;
11947 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11949 if (res0 == NULL){
11950 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011951 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 }
11953 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011957 }
11958 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 /* Got a format specifier */
11960 int flags = 0;
11961 Py_ssize_t width = -1;
11962 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 Py_UCS4 c = '\0';
11964 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 int isnumok;
11966 PyObject *v = NULL;
11967 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 void *pbuf;
11969 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 Py_ssize_t len, len1;
11972 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 fmtpos++;
11975 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
11976 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 Py_ssize_t keylen;
11978 PyObject *key;
11979 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000011980
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 if (dict == NULL) {
11982 PyErr_SetString(PyExc_TypeError,
11983 "format requires a mapping");
11984 goto onError;
11985 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011987 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 /* Skip over balanced parentheses */
11990 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011998 if (fmtcnt < 0 || pcount > 0) {
11999 PyErr_SetString(PyExc_ValueError,
12000 "incomplete format key");
12001 goto onError;
12002 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012003 key = PyUnicode_Substring((PyObject*)uformat,
12004 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 if (key == NULL)
12006 goto onError;
12007 if (args_owned) {
12008 Py_DECREF(args);
12009 args_owned = 0;
12010 }
12011 args = PyObject_GetItem(dict, key);
12012 Py_DECREF(key);
12013 if (args == NULL) {
12014 goto onError;
12015 }
12016 args_owned = 1;
12017 arglen = -1;
12018 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012019 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 case '-': flags |= F_LJUST; continue;
12023 case '+': flags |= F_SIGN; continue;
12024 case ' ': flags |= F_BLANK; continue;
12025 case '#': flags |= F_ALT; continue;
12026 case '0': flags |= F_ZERO; continue;
12027 }
12028 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012029 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 if (c == '*') {
12031 v = getnextarg(args, arglen, &argidx);
12032 if (v == NULL)
12033 goto onError;
12034 if (!PyLong_Check(v)) {
12035 PyErr_SetString(PyExc_TypeError,
12036 "* wants int");
12037 goto onError;
12038 }
12039 width = PyLong_AsLong(v);
12040 if (width == -1 && PyErr_Occurred())
12041 goto onError;
12042 if (width < 0) {
12043 flags |= F_LJUST;
12044 width = -width;
12045 }
12046 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012048 }
12049 else if (c >= '0' && c <= '9') {
12050 width = c - '0';
12051 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012053 if (c < '0' || c > '9')
12054 break;
12055 if ((width*10) / 10 != width) {
12056 PyErr_SetString(PyExc_ValueError,
12057 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012058 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012059 }
12060 width = width*10 + (c - '0');
12061 }
12062 }
12063 if (c == '.') {
12064 prec = 0;
12065 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012067 if (c == '*') {
12068 v = getnextarg(args, arglen, &argidx);
12069 if (v == NULL)
12070 goto onError;
12071 if (!PyLong_Check(v)) {
12072 PyErr_SetString(PyExc_TypeError,
12073 "* wants int");
12074 goto onError;
12075 }
12076 prec = PyLong_AsLong(v);
12077 if (prec == -1 && PyErr_Occurred())
12078 goto onError;
12079 if (prec < 0)
12080 prec = 0;
12081 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012083 }
12084 else if (c >= '0' && c <= '9') {
12085 prec = c - '0';
12086 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012088 if (c < '0' || c > '9')
12089 break;
12090 if ((prec*10) / 10 != prec) {
12091 PyErr_SetString(PyExc_ValueError,
12092 "prec too big");
12093 goto onError;
12094 }
12095 prec = prec*10 + (c - '0');
12096 }
12097 }
12098 } /* prec */
12099 if (fmtcnt >= 0) {
12100 if (c == 'h' || c == 'l' || c == 'L') {
12101 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012103 }
12104 }
12105 if (fmtcnt < 0) {
12106 PyErr_SetString(PyExc_ValueError,
12107 "incomplete format");
12108 goto onError;
12109 }
12110 if (c != '%') {
12111 v = getnextarg(args, arglen, &argidx);
12112 if (v == NULL)
12113 goto onError;
12114 }
12115 sign = 0;
12116 fill = ' ';
12117 switch (c) {
12118
12119 case '%':
12120 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012122 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012124 len = 1;
12125 break;
12126
12127 case 's':
12128 case 'r':
12129 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012130 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 temp = v;
12132 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012133 }
12134 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012135 if (c == 's')
12136 temp = PyObject_Str(v);
12137 else if (c == 'r')
12138 temp = PyObject_Repr(v);
12139 else
12140 temp = PyObject_ASCII(v);
12141 if (temp == NULL)
12142 goto onError;
12143 if (PyUnicode_Check(temp))
12144 /* nothing to do */;
12145 else {
12146 Py_DECREF(temp);
12147 PyErr_SetString(PyExc_TypeError,
12148 "%s argument has non-string str()");
12149 goto onError;
12150 }
12151 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 if (PyUnicode_READY(temp) == -1) {
12153 Py_CLEAR(temp);
12154 goto onError;
12155 }
12156 pbuf = PyUnicode_DATA(temp);
12157 kind = PyUnicode_KIND(temp);
12158 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012159 if (prec >= 0 && len > prec)
12160 len = prec;
12161 break;
12162
12163 case 'i':
12164 case 'd':
12165 case 'u':
12166 case 'o':
12167 case 'x':
12168 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012169 isnumok = 0;
12170 if (PyNumber_Check(v)) {
12171 PyObject *iobj=NULL;
12172
12173 if (PyLong_Check(v)) {
12174 iobj = v;
12175 Py_INCREF(iobj);
12176 }
12177 else {
12178 iobj = PyNumber_Long(v);
12179 }
12180 if (iobj!=NULL) {
12181 if (PyLong_Check(iobj)) {
12182 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012183 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012184 Py_DECREF(iobj);
12185 if (!temp)
12186 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012187 if (PyUnicode_READY(temp) == -1) {
12188 Py_CLEAR(temp);
12189 goto onError;
12190 }
12191 pbuf = PyUnicode_DATA(temp);
12192 kind = PyUnicode_KIND(temp);
12193 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012194 sign = 1;
12195 }
12196 else {
12197 Py_DECREF(iobj);
12198 }
12199 }
12200 }
12201 if (!isnumok) {
12202 PyErr_Format(PyExc_TypeError,
12203 "%%%c format: a number is required, "
12204 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12205 goto onError;
12206 }
12207 if (flags & F_ZERO)
12208 fill = '0';
12209 break;
12210
12211 case 'e':
12212 case 'E':
12213 case 'f':
12214 case 'F':
12215 case 'g':
12216 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012217 temp = formatfloat(v, flags, prec, c);
12218 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012219 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 if (PyUnicode_READY(temp) == -1) {
12221 Py_CLEAR(temp);
12222 goto onError;
12223 }
12224 pbuf = PyUnicode_DATA(temp);
12225 kind = PyUnicode_KIND(temp);
12226 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012227 sign = 1;
12228 if (flags & F_ZERO)
12229 fill = '0';
12230 break;
12231
12232 case 'c':
12233 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012235 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012236 if (len < 0)
12237 goto onError;
12238 break;
12239
12240 default:
12241 PyErr_Format(PyExc_ValueError,
12242 "unsupported format character '%c' (0x%x) "
12243 "at index %zd",
12244 (31<=c && c<=126) ? (char)c : '?',
12245 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012247 goto onError;
12248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 /* pbuf is initialized here. */
12250 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012251 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12253 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12254 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012255 len--;
12256 }
12257 else if (flags & F_SIGN)
12258 sign = '+';
12259 else if (flags & F_BLANK)
12260 sign = ' ';
12261 else
12262 sign = 0;
12263 }
12264 if (width < len)
12265 width = len;
12266 if (rescnt - (sign != 0) < width) {
12267 reslen -= rescnt;
12268 rescnt = width + fmtcnt + 100;
12269 reslen += rescnt;
12270 if (reslen < 0) {
12271 Py_XDECREF(temp);
12272 PyErr_NoMemory();
12273 goto onError;
12274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012275 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12276 if (res0 == 0) {
12277 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012278 Py_XDECREF(temp);
12279 goto onError;
12280 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012282 }
12283 if (sign) {
12284 if (fill != ' ')
12285 *res++ = sign;
12286 rescnt--;
12287 if (width > len)
12288 width--;
12289 }
12290 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12292 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012293 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12295 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012296 }
12297 rescnt -= 2;
12298 width -= 2;
12299 if (width < 0)
12300 width = 0;
12301 len -= 2;
12302 }
12303 if (width > len && !(flags & F_LJUST)) {
12304 do {
12305 --rescnt;
12306 *res++ = fill;
12307 } while (--width > len);
12308 }
12309 if (fill == ' ') {
12310 if (sign)
12311 *res++ = sign;
12312 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12314 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12315 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12316 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012317 }
12318 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 /* Copy all characters, preserving len */
12320 len1 = len;
12321 while (len1--) {
12322 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12323 rescnt--;
12324 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012325 while (--width >= len) {
12326 --rescnt;
12327 *res++ = ' ';
12328 }
12329 if (dict && (argidx < arglen) && c != '%') {
12330 PyErr_SetString(PyExc_TypeError,
12331 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012332 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012333 goto onError;
12334 }
12335 Py_XDECREF(temp);
12336 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012337 } /* until end */
12338 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012339 PyErr_SetString(PyExc_TypeError,
12340 "not all arguments converted during string formatting");
12341 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342 }
12343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344
12345 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12346 if (*res > max)
12347 max = *res;
12348 result = PyUnicode_New(reslen - rescnt, max);
12349 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 kind = PyUnicode_KIND(result);
12352 for (res = res0; res < res0+reslen-rescnt; res++)
12353 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12354 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012356 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357 }
12358 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359 return (PyObject *)result;
12360
Benjamin Peterson29060642009-01-31 22:14:21 +000012361 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363 Py_DECREF(uformat);
12364 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012365 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366 }
12367 return NULL;
12368}
12369
Jeremy Hylton938ace62002-07-17 16:30:39 +000012370static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012371unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12372
Tim Peters6d6c1a32001-08-02 04:15:00 +000012373static PyObject *
12374unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12375{
Benjamin Peterson29060642009-01-31 22:14:21 +000012376 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012377 static char *kwlist[] = {"object", "encoding", "errors", 0};
12378 char *encoding = NULL;
12379 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012380
Benjamin Peterson14339b62009-01-31 16:36:08 +000012381 if (type != &PyUnicode_Type)
12382 return unicode_subtype_new(type, args, kwds);
12383 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012384 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012385 return NULL;
12386 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012388 if (encoding == NULL && errors == NULL)
12389 return PyObject_Str(x);
12390 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012391 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012392}
12393
Guido van Rossume023fe02001-08-30 03:12:59 +000012394static PyObject *
12395unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12396{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012397 PyUnicodeObject *tmp, *pnew;
12398 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012400
Benjamin Peterson14339b62009-01-31 16:36:08 +000012401 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12402 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12403 if (tmp == NULL)
12404 return NULL;
12405 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12407 // it seems kind of strange that tp_alloc gets passed the size
12408 // of the unicode string because there will follow another
12409 // malloc.
12410 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12411 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012412 if (pnew == NULL) {
12413 Py_DECREF(tmp);
12414 return NULL;
12415 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12417 if (_PyUnicode_WSTR(pnew) == NULL) {
12418 err = PyErr_NoMemory();
12419 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12422 _PyUnicode_WSTR_LENGTH(pnew) = n;
12423 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12424 _PyUnicode_STATE(pnew).interned = 0;
12425 _PyUnicode_STATE(pnew).kind = 0;
12426 _PyUnicode_STATE(pnew).compact = 0;
12427 _PyUnicode_STATE(pnew).ready = 0;
12428 _PyUnicode_STATE(pnew).ascii = 0;
12429 pnew->data.any = NULL;
12430 _PyUnicode_LENGTH(pnew) = 0;
12431 pnew->_base.utf8 = NULL;
12432 pnew->_base.utf8_length = 0;
12433
12434 if (PyUnicode_READY(pnew) == -1) {
12435 PyObject_FREE(_PyUnicode_WSTR(pnew));
12436 goto onError;
12437 }
12438
Benjamin Peterson14339b62009-01-31 16:36:08 +000012439 Py_DECREF(tmp);
12440 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441
12442 onError:
12443 _Py_ForgetReference((PyObject *)pnew);
12444 PyObject_Del(pnew);
12445 Py_DECREF(tmp);
12446 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012447}
12448
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012449PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012450 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012451\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012452Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012453encoding defaults to the current default string encoding.\n\
12454errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012455
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012456static PyObject *unicode_iter(PyObject *seq);
12457
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012459 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012460 "str", /* tp_name */
12461 sizeof(PyUnicodeObject), /* tp_size */
12462 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 (destructor)unicode_dealloc, /* tp_dealloc */
12465 0, /* tp_print */
12466 0, /* tp_getattr */
12467 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012468 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012469 unicode_repr, /* tp_repr */
12470 &unicode_as_number, /* tp_as_number */
12471 &unicode_as_sequence, /* tp_as_sequence */
12472 &unicode_as_mapping, /* tp_as_mapping */
12473 (hashfunc) unicode_hash, /* tp_hash*/
12474 0, /* tp_call*/
12475 (reprfunc) unicode_str, /* tp_str */
12476 PyObject_GenericGetAttr, /* tp_getattro */
12477 0, /* tp_setattro */
12478 0, /* tp_as_buffer */
12479 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012480 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012481 unicode_doc, /* tp_doc */
12482 0, /* tp_traverse */
12483 0, /* tp_clear */
12484 PyUnicode_RichCompare, /* tp_richcompare */
12485 0, /* tp_weaklistoffset */
12486 unicode_iter, /* tp_iter */
12487 0, /* tp_iternext */
12488 unicode_methods, /* tp_methods */
12489 0, /* tp_members */
12490 0, /* tp_getset */
12491 &PyBaseObject_Type, /* tp_base */
12492 0, /* tp_dict */
12493 0, /* tp_descr_get */
12494 0, /* tp_descr_set */
12495 0, /* tp_dictoffset */
12496 0, /* tp_init */
12497 0, /* tp_alloc */
12498 unicode_new, /* tp_new */
12499 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500};
12501
12502/* Initialize the Unicode implementation */
12503
Thomas Wouters78890102000-07-22 19:25:51 +000012504void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012506 int i;
12507
Thomas Wouters477c8d52006-05-27 19:21:47 +000012508 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012510 0x000A, /* LINE FEED */
12511 0x000D, /* CARRIAGE RETURN */
12512 0x001C, /* FILE SEPARATOR */
12513 0x001D, /* GROUP SEPARATOR */
12514 0x001E, /* RECORD SEPARATOR */
12515 0x0085, /* NEXT LINE */
12516 0x2028, /* LINE SEPARATOR */
12517 0x2029, /* PARAGRAPH SEPARATOR */
12518 };
12519
Fred Drakee4315f52000-05-09 19:53:39 +000012520 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012522 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012524
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012525 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012526 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012527 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012528 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012529
12530 /* initialize the linebreak bloom filter */
12531 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012533 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012534
12535 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536}
12537
12538/* Finalize the Unicode implementation */
12539
/* Currently a no-op: does no work and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
12545
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546void
Thomas Wouters78890102000-07-22 19:25:51 +000012547_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012549 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012551 Py_XDECREF(unicode_empty);
12552 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012553
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012554 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012555 if (unicode_latin1[i]) {
12556 Py_DECREF(unicode_latin1[i]);
12557 unicode_latin1[i] = NULL;
12558 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012559 }
Christian Heimesa156e092008-02-16 07:38:31 +000012560 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012562
Walter Dörwald16807132007-05-25 13:52:07 +000012563void
12564PyUnicode_InternInPlace(PyObject **p)
12565{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012566 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12567 PyObject *t;
12568 if (s == NULL || !PyUnicode_Check(s))
12569 Py_FatalError(
12570 "PyUnicode_InternInPlace: unicode strings only please!");
12571 /* If it's a subclass, we don't really know what putting
12572 it in the interned dict might do. */
12573 if (!PyUnicode_CheckExact(s))
12574 return;
12575 if (PyUnicode_CHECK_INTERNED(s))
12576 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 if (PyUnicode_READY(s) == -1) {
12578 assert(0 && "ready fail in intern...");
12579 return;
12580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012581 if (interned == NULL) {
12582 interned = PyDict_New();
12583 if (interned == NULL) {
12584 PyErr_Clear(); /* Don't leave an exception */
12585 return;
12586 }
12587 }
12588 /* It might be that the GetItem call fails even
12589 though the key is present in the dictionary,
12590 namely when this happens during a stack overflow. */
12591 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012593 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012594
Benjamin Peterson29060642009-01-31 22:14:21 +000012595 if (t) {
12596 Py_INCREF(t);
12597 Py_DECREF(*p);
12598 *p = t;
12599 return;
12600 }
Walter Dörwald16807132007-05-25 13:52:07 +000012601
Benjamin Peterson14339b62009-01-31 16:36:08 +000012602 PyThreadState_GET()->recursion_critical = 1;
12603 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12604 PyErr_Clear();
12605 PyThreadState_GET()->recursion_critical = 0;
12606 return;
12607 }
12608 PyThreadState_GET()->recursion_critical = 0;
12609 /* The two references in interned are not counted by refcnt.
12610 The deallocator will take care of this */
12611 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012613}
12614
12615void
12616PyUnicode_InternImmortal(PyObject **p)
12617{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12619
Benjamin Peterson14339b62009-01-31 16:36:08 +000012620 PyUnicode_InternInPlace(p);
12621 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012623 Py_INCREF(*p);
12624 }
Walter Dörwald16807132007-05-25 13:52:07 +000012625}
12626
12627PyObject *
12628PyUnicode_InternFromString(const char *cp)
12629{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012630 PyObject *s = PyUnicode_FromString(cp);
12631 if (s == NULL)
12632 return NULL;
12633 PyUnicode_InternInPlace(&s);
12634 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012635}
12636
Alexander Belopolsky40018472011-02-26 01:02:56 +000012637void
12638_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012639{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012640 PyObject *keys;
12641 PyUnicodeObject *s;
12642 Py_ssize_t i, n;
12643 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012644
Benjamin Peterson14339b62009-01-31 16:36:08 +000012645 if (interned == NULL || !PyDict_Check(interned))
12646 return;
12647 keys = PyDict_Keys(interned);
12648 if (keys == NULL || !PyList_Check(keys)) {
12649 PyErr_Clear();
12650 return;
12651 }
Walter Dörwald16807132007-05-25 13:52:07 +000012652
Benjamin Peterson14339b62009-01-31 16:36:08 +000012653 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12654 detector, interned unicode strings are not forcibly deallocated;
12655 rather, we give them their stolen references back, and then clear
12656 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012657
Benjamin Peterson14339b62009-01-31 16:36:08 +000012658 n = PyList_GET_SIZE(keys);
12659 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012661 for (i = 0; i < n; i++) {
12662 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 if (PyUnicode_READY(s) == -1)
12664 fprintf(stderr, "could not ready string\n");
12665 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012666 case SSTATE_NOT_INTERNED:
12667 /* XXX Shouldn't happen */
12668 break;
12669 case SSTATE_INTERNED_IMMORTAL:
12670 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012672 break;
12673 case SSTATE_INTERNED_MORTAL:
12674 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012676 break;
12677 default:
12678 Py_FatalError("Inconsistent interned string state.");
12679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012681 }
12682 fprintf(stderr, "total size of all interned strings: "
12683 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12684 "mortal/immortal\n", mortal_size, immortal_size);
12685 Py_DECREF(keys);
12686 PyDict_Clear(interned);
12687 Py_DECREF(interned);
12688 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012689}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012690
12691
12692/********************* Unicode Iterator **************************/
12693
12694typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012695 PyObject_HEAD
12696 Py_ssize_t it_index;
12697 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012698} unicodeiterobject;
12699
12700static void
12701unicodeiter_dealloc(unicodeiterobject *it)
12702{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012703 _PyObject_GC_UNTRACK(it);
12704 Py_XDECREF(it->it_seq);
12705 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012706}
12707
12708static int
12709unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12710{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012711 Py_VISIT(it->it_seq);
12712 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012713}
12714
12715static PyObject *
12716unicodeiter_next(unicodeiterobject *it)
12717{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012718 PyUnicodeObject *seq;
12719 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012720
Benjamin Peterson14339b62009-01-31 16:36:08 +000012721 assert(it != NULL);
12722 seq = it->it_seq;
12723 if (seq == NULL)
12724 return NULL;
12725 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012727 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12728 int kind = PyUnicode_KIND(seq);
12729 void *data = PyUnicode_DATA(seq);
12730 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12731 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012732 if (item != NULL)
12733 ++it->it_index;
12734 return item;
12735 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012736
Benjamin Peterson14339b62009-01-31 16:36:08 +000012737 Py_DECREF(seq);
12738 it->it_seq = NULL;
12739 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012740}
12741
12742static PyObject *
12743unicodeiter_len(unicodeiterobject *it)
12744{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012745 Py_ssize_t len = 0;
12746 if (it->it_seq)
12747 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12748 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012749}
12750
12751PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12752
12753static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012754 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012755 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012756 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012757};
12758
12759PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012760 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12761 "str_iterator", /* tp_name */
12762 sizeof(unicodeiterobject), /* tp_basicsize */
12763 0, /* tp_itemsize */
12764 /* methods */
12765 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12766 0, /* tp_print */
12767 0, /* tp_getattr */
12768 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012769 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012770 0, /* tp_repr */
12771 0, /* tp_as_number */
12772 0, /* tp_as_sequence */
12773 0, /* tp_as_mapping */
12774 0, /* tp_hash */
12775 0, /* tp_call */
12776 0, /* tp_str */
12777 PyObject_GenericGetAttr, /* tp_getattro */
12778 0, /* tp_setattro */
12779 0, /* tp_as_buffer */
12780 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12781 0, /* tp_doc */
12782 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12783 0, /* tp_clear */
12784 0, /* tp_richcompare */
12785 0, /* tp_weaklistoffset */
12786 PyObject_SelfIter, /* tp_iter */
12787 (iternextfunc)unicodeiter_next, /* tp_iternext */
12788 unicodeiter_methods, /* tp_methods */
12789 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012790};
12791
12792static PyObject *
12793unicode_iter(PyObject *seq)
12794{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012795 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012796
Benjamin Peterson14339b62009-01-31 16:36:08 +000012797 if (!PyUnicode_Check(seq)) {
12798 PyErr_BadInternalCall();
12799 return NULL;
12800 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012801 if (PyUnicode_READY(seq) == -1)
12802 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012803 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12804 if (it == NULL)
12805 return NULL;
12806 it->it_index = 0;
12807 Py_INCREF(seq);
12808 it->it_seq = (PyUnicodeObject *)seq;
12809 _PyObject_GC_TRACK(it);
12810 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012811}
12812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813#define UNIOP(x) Py_UNICODE_##x
12814#define UNIOP_t Py_UNICODE
12815#include "uniops.h"
12816#undef UNIOP
12817#undef UNIOP_t
12818#define UNIOP(x) Py_UCS4_##x
12819#define UNIOP_t Py_UCS4
12820#include "uniops.h"
12821#undef UNIOP
12822#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012823
Victor Stinner71133ff2010-09-01 23:43:53 +000012824Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012825PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012826{
12827 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12828 Py_UNICODE *copy;
12829 Py_ssize_t size;
12830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831 if (!PyUnicode_Check(unicode)) {
12832 PyErr_BadArgument();
12833 return NULL;
12834 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012835 /* Ensure we won't overflow the size. */
12836 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12837 PyErr_NoMemory();
12838 return NULL;
12839 }
12840 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12841 size *= sizeof(Py_UNICODE);
12842 copy = PyMem_Malloc(size);
12843 if (copy == NULL) {
12844 PyErr_NoMemory();
12845 return NULL;
12846 }
12847 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12848 return copy;
12849}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012850
Georg Brandl66c221e2010-10-14 07:04:07 +000012851/* A _string module, to export formatter_parser and formatter_field_name_split
12852 to the string.Formatter class implemented in Python. */
12853
12854static PyMethodDef _string_methods[] = {
12855 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12856 METH_O, PyDoc_STR("split the argument as a field name")},
12857 {"formatter_parser", (PyCFunction) formatter_parser,
12858 METH_O, PyDoc_STR("parse the argument as a format string")},
12859 {NULL, NULL}
12860};
12861
12862static struct PyModuleDef _string_module = {
12863 PyModuleDef_HEAD_INIT,
12864 "_string",
12865 PyDoc_STR("string helper module"),
12866 0,
12867 _string_methods,
12868 NULL,
12869 NULL,
12870 NULL,
12871 NULL
12872};
12873
12874PyMODINIT_FUNC
12875PyInit__string(void)
12876{
12877 return PyModule_Create(&_string_module);
12878}
12879
12880
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012881#ifdef __cplusplus
12882}
12883#endif