blob: 05d572b08bbb912dfbc4499d135fb91fd70b4051 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each of begin, end and to is evaluated exactly once, so callers may
   pass expressions that are expensive or that have side effects. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        const from_type *iter_ = (begin);                            \
        const from_type *end_ = (end);                               \
        to_type *to_ = (to_type *)(to);                              \
        for (; iter_ < end_; ++iter_, ++to_) {                       \
            *to_ = (to_type)*iter_;                                  \
        }                                                            \
    } while (0)
107
/* Internal field accessors.  Each macro casts `op` to the struct that
   declares the field it touches. */

/* UTF-8 buffer: for compact ASCII strings the memory directly following
   the PyASCIIObject header, otherwise the `utf8` member. */
#define _PyUnicode_UTF8(op)                             \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((char*)((PyASCIIObject*)(op) + 1)) :              \
     ((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 length: for compact ASCII strings the `length` member, otherwise
   the `utf8_length` member. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->utf8_length)

/* Plain member access, usable as lvalues. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Member access preceded by an assert() that `op` is a unicode object. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200131
Walter Dörwald16807132007-05-25 13:52:07 +0000132/* This dictionary holds all interned unicode strings. Note that references
133 to strings in this dictionary are *not* counted in the string's ob_refcnt.
134 When the interned string reaches a refcnt of 0 the string deallocation
135 function will delete the reference from this dictionary.
136
137 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000138 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000139*/
140static PyObject *interned;
141
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000142/* The empty Unicode object is shared to improve performance. */
143static PyUnicodeObject *unicode_empty;
144
145/* Single character Unicode strings in the Latin-1 range are being
146 shared as well. */
147static PyUnicodeObject *unicode_latin1[256];
148
/* Fast detection of the most frequent whitespace characters.

   One entry per ASCII code point (128 entries, 16 per row); an entry is 1
   for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
179
Alexander Belopolsky40018472011-02-26 01:02:56 +0000180static PyObject *
181unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000182 PyObject **errorHandler,const char *encoding, const char *reason,
183 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
184 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
185
Alexander Belopolsky40018472011-02-26 01:02:56 +0000186static void
187raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300188 const char *encoding,
189 const Py_UNICODE *unicode, Py_ssize_t size,
190 Py_ssize_t startpos, Py_ssize_t endpos,
191 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000192
/* Fast detection of line break characters in the ASCII range.

   One entry per ASCII code point (128 entries); an entry is 1 for the
   characters named in the comments below and 0 otherwise.  The table is
   only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
220
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300221/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
222 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000223Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000224PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000225{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000226#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000227 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000228#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000229 /* This is actually an illegal character, so it should
230 not be passed to unichr. */
231 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232#endif
233}
234
Thomas Wouters477c8d52006-05-27 19:21:47 +0000235/* --- Bloom Filters ----------------------------------------------------- */
236
237/* stuff to implement simple "bloom filters" for Unicode characters.
238 to keep things simple, we use a single bitmask, using the least 5
239 bits from each unicode characters as the bit index. */
240
241/* the linebreak mask is set up by Unicode_Init below */
242
Antoine Pitrouf068f942010-01-13 14:19:12 +0000243#if LONG_BIT >= 128
244#define BLOOM_WIDTH 128
245#elif LONG_BIT >= 64
246#define BLOOM_WIDTH 64
247#elif LONG_BIT >= 32
248#define BLOOM_WIDTH 32
249#else
250#error "LONG_BIT is smaller than 32"
251#endif
252
Thomas Wouters477c8d52006-05-27 19:21:47 +0000253#define BLOOM_MASK unsigned long
254
255static BLOOM_MASK bloom_linebreak;
256
Antoine Pitrouf068f942010-01-13 14:19:12 +0000257#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
258#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000259
Benjamin Peterson29060642009-01-31 22:14:21 +0000260#define BLOOM_LINEBREAK(ch) \
261 ((ch) < 128U ? ascii_linebreak[(ch)] : \
262 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
Alexander Belopolsky40018472011-02-26 01:02:56 +0000264Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200265make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000266{
267 /* calculate simple bloom-style bitmask for a given unicode string */
268
Antoine Pitrouf068f942010-01-13 14:19:12 +0000269 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270 Py_ssize_t i;
271
272 mask = 0;
273 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200274 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000275
276 return mask;
277}
278
/* True if chr occurs in str: the bloom mask is tested first and only on a
   hit is the string actually searched with PyUnicode_FindChar(). */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
Guido van Rossumd57fd912000-03-10 22:53:23 +0000283/* --- Unicode Object ----------------------------------------------------- */
284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200285static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200286fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
287
288Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
289 Py_ssize_t size, Py_UCS4 ch,
290 int direction)
291{
292 /* like wcschr, but doesn't stop at NULL characters */
293 Py_ssize_t i;
294 if (direction == 1) {
295 for(i = 0; i < size; i++)
296 if (PyUnicode_READ(kind, s, i) == ch)
297 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
298 }
299 else {
300 for(i = size-1; i >= 0; i--)
301 if (PyUnicode_READ(kind, s, i) == ch)
302 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
303 }
304 return NULL;
305}
306
Alexander Belopolsky40018472011-02-26 01:02:56 +0000307static int
308unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200309 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310{
311 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200313 /* Resizing is only supported for old unicode objects. */
314 assert(!PyUnicode_IS_COMPACT(unicode));
315 assert(_PyUnicode_WSTR(unicode) != NULL);
316
317 /* ... and only if they have not been readied yet, because
318 callees usually rely on the wstr representation when resizing. */
319 assert(unicode->data.any == NULL);
320
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000321 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200322 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000323 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000325 /* Resizing shared object (unicode_empty or single character
326 objects) in-place is not allowed. Use PyUnicode_Resize()
327 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000328
Benjamin Peterson14339b62009-01-31 16:36:08 +0000329 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200330 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
331 _PyUnicode_WSTR(unicode)[0] < 256U &&
332 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000334 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000335 return -1;
336 }
337
Thomas Wouters477c8d52006-05-27 19:21:47 +0000338 /* We allocate one more byte to make sure the string is Ux0000 terminated.
339 The overallocation is also used by fastsearch, which assumes that it's
340 safe to look at str[length] (without making any assumptions about what
341 it contains). */
342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200343 oldstr = _PyUnicode_WSTR(unicode);
344 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
345 sizeof(Py_UNICODE) * (length + 1));
346 if (!_PyUnicode_WSTR(unicode)) {
347 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 PyErr_NoMemory();
349 return -1;
350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200351 _PyUnicode_WSTR(unicode)[length] = 0;
352 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200355 if (unicode->data.any != NULL) {
356 PyObject_FREE(unicode->data.any);
357 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
358 PyObject_FREE(unicode->_base.utf8);
359 }
360 unicode->_base.utf8 = NULL;
361 unicode->_base.utf8_length = 0;
362 unicode->data.any = NULL;
363 _PyUnicode_LENGTH(unicode) = 0;
364 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
365 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200367 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000368
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 return 0;
370}
371
372/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000373 Ux0000 terminated; some code (e.g. new_identifier)
374 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375
376 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000377 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378
379*/
380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200381#ifdef Py_DEBUG
382int unicode_old_new_calls = 0;
383#endif
384
Alexander Belopolsky40018472011-02-26 01:02:56 +0000385static PyUnicodeObject *
386_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387{
388 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200389 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000390
Thomas Wouters477c8d52006-05-27 19:21:47 +0000391 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392 if (length == 0 && unicode_empty != NULL) {
393 Py_INCREF(unicode_empty);
394 return unicode_empty;
395 }
396
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000397 /* Ensure we won't overflow the size. */
398 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
399 return (PyUnicodeObject *)PyErr_NoMemory();
400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200401 if (length < 0) {
402 PyErr_SetString(PyExc_SystemError,
403 "Negative size passed to _PyUnicode_New");
404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405 }
406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200407#ifdef Py_DEBUG
408 ++unicode_old_new_calls;
409#endif
410
411 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
412 if (unicode == NULL)
413 return NULL;
414 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
415 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
416 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 PyErr_NoMemory();
418 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200420
Jeremy Hyltond8082792003-09-16 19:41:39 +0000421 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000422 * the caller fails before initializing str -- unicode_resize()
423 * reads str[0], and the Keep-Alive optimization can keep memory
424 * allocated for str alive across a call to unicode_dealloc(unicode).
425 * We don't want unicode_resize to read uninitialized memory in
426 * that case.
427 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200428 _PyUnicode_WSTR(unicode)[0] = 0;
429 _PyUnicode_WSTR(unicode)[length] = 0;
430 _PyUnicode_WSTR_LENGTH(unicode) = length;
431 _PyUnicode_HASH(unicode) = -1;
432 _PyUnicode_STATE(unicode).interned = 0;
433 _PyUnicode_STATE(unicode).kind = 0;
434 _PyUnicode_STATE(unicode).compact = 0;
435 _PyUnicode_STATE(unicode).ready = 0;
436 _PyUnicode_STATE(unicode).ascii = 0;
437 unicode->data.any = NULL;
438 _PyUnicode_LENGTH(unicode) = 0;
439 unicode->_base.utf8 = NULL;
440 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000442
Benjamin Peterson29060642009-01-31 22:14:21 +0000443 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000444 /* XXX UNREF/NEWREF interface should be more symmetrical */
445 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000446 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000447 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000449}
450
#ifdef Py_DEBUG
/* Debug-build counter, incremented by PyUnicode_New(). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of _PyUnicode_UTF8(). */
char *
_PyUnicode_utf8(void *unicode)
{
    return _PyUnicode_UTF8(unicode);
}

/* Function form of _PyUnicode_COMPACT_DATA(). */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact flags and the candidate data
   pointers to stdout, then return PyUnicode_DATA(). */
void *
_PyUnicode_data(void *unicode)
{
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n",
           ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n",
           ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
472
473PyObject *
474PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
475{
476 PyObject *obj;
477 PyCompactUnicodeObject *unicode;
478 void *data;
479 int kind_state;
480 int is_sharing = 0, is_ascii = 0;
481 Py_ssize_t char_size;
482 Py_ssize_t struct_size;
483
484 /* Optimization for empty strings */
485 if (size == 0 && unicode_empty != NULL) {
486 Py_INCREF(unicode_empty);
487 return (PyObject *)unicode_empty;
488 }
489
490#ifdef Py_DEBUG
491 ++unicode_new_new_calls;
492#endif
493
494 struct_size = sizeof(PyCompactUnicodeObject);
495 if (maxchar < 128) {
496 kind_state = PyUnicode_1BYTE_KIND;
497 char_size = 1;
498 is_ascii = 1;
499 struct_size = sizeof(PyASCIIObject);
500 }
501 else if (maxchar < 256) {
502 kind_state = PyUnicode_1BYTE_KIND;
503 char_size = 1;
504 }
505 else if (maxchar < 65536) {
506 kind_state = PyUnicode_2BYTE_KIND;
507 char_size = 2;
508 if (sizeof(wchar_t) == 2)
509 is_sharing = 1;
510 }
511 else {
512 kind_state = PyUnicode_4BYTE_KIND;
513 char_size = 4;
514 if (sizeof(wchar_t) == 4)
515 is_sharing = 1;
516 }
517
518 /* Ensure we won't overflow the size. */
519 if (size < 0) {
520 PyErr_SetString(PyExc_SystemError,
521 "Negative size passed to PyUnicode_New");
522 return NULL;
523 }
524 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
525 return PyErr_NoMemory();
526
527 /* Duplicated allocation code from _PyObject_New() instead of a call to
528 * PyObject_New() so we are able to allocate space for the object and
529 * it's data buffer.
530 */
531 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
532 if (obj == NULL)
533 return PyErr_NoMemory();
534 obj = PyObject_INIT(obj, &PyUnicode_Type);
535 if (obj == NULL)
536 return NULL;
537
538 unicode = (PyCompactUnicodeObject *)obj;
539 if (is_ascii)
540 data = ((PyASCIIObject*)obj) + 1;
541 else
542 data = unicode + 1;
543 _PyUnicode_LENGTH(unicode) = size;
544 _PyUnicode_HASH(unicode) = -1;
545 _PyUnicode_STATE(unicode).interned = 0;
546 _PyUnicode_STATE(unicode).kind = kind_state;
547 _PyUnicode_STATE(unicode).compact = 1;
548 _PyUnicode_STATE(unicode).ready = 1;
549 _PyUnicode_STATE(unicode).ascii = is_ascii;
550 if (is_ascii) {
551 ((char*)data)[size] = 0;
552 _PyUnicode_WSTR(unicode) = NULL;
553 }
554 else if (kind_state == PyUnicode_1BYTE_KIND) {
555 ((char*)data)[size] = 0;
556 _PyUnicode_WSTR(unicode) = NULL;
557 _PyUnicode_WSTR_LENGTH(unicode) = 0;
558 unicode->utf8_length = 0;
559 unicode->utf8 = NULL;
560 }
561 else {
562 unicode->utf8 = NULL;
563 if (kind_state == PyUnicode_2BYTE_KIND)
564 ((Py_UCS2*)data)[size] = 0;
565 else /* kind_state == PyUnicode_4BYTE_KIND */
566 ((Py_UCS4*)data)[size] = 0;
567 if (is_sharing) {
568 _PyUnicode_WSTR_LENGTH(unicode) = size;
569 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
570 }
571 else {
572 _PyUnicode_WSTR_LENGTH(unicode) = 0;
573 _PyUnicode_WSTR(unicode) = NULL;
574 }
575 }
576 return obj;
577}
578
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   Valid surrogate pairs are combined into one code point; every other
   code unit (including a lone surrogate) is copied as is.  The other
   conversions are implemented as macros for efficiency.

   The output is written to the 4-byte data buffer of `unicode`.  This
   function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character.

   Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (*src >= 0xD800 && *src <= 0xDBFF
            && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* high surrogate followed by a low surrogate */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
617
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200618Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200619PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
620 PyObject *from, Py_ssize_t from_start,
621 Py_ssize_t how_many)
622{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200623 unsigned int from_kind, to_kind;
624 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200625
Victor Stinnerb1536152011-09-30 02:26:10 +0200626 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
627 PyErr_BadInternalCall();
628 return -1;
629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200630
631 if (PyUnicode_READY(from))
632 return -1;
633 if (PyUnicode_READY(to))
634 return -1;
635
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200636 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200637 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
638 PyErr_Format(PyExc_ValueError,
639 "Cannot write %zi characters at %zi "
640 "in a string of %zi characters",
641 how_many, to_start, PyUnicode_GET_LENGTH(to));
642 return -1;
643 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200644 if (how_many == 0)
645 return 0;
646
647 if (Py_REFCNT(to) != 1) {
648 PyErr_SetString(PyExc_ValueError,
649 "Cannot modify a string having more than 1 reference");
650 return -1;
651 }
Victor Stinnerc17f5402011-09-29 00:16:58 +0200652 _PyUnicode_DIRTY(to);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200655 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200656 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200657 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200658
659 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200660 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200661 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200662 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200663 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200664 + PyUnicode_KIND_SIZE(from_kind, from_start),
665 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200666 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200667 else if (from_kind == PyUnicode_1BYTE_KIND
668 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200669 {
670 _PyUnicode_CONVERT_BYTES(
671 Py_UCS1, Py_UCS2,
672 PyUnicode_1BYTE_DATA(from) + from_start,
673 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
674 PyUnicode_2BYTE_DATA(to) + to_start
675 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200676 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200677 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200678 && to_kind == PyUnicode_4BYTE_KIND)
679 {
680 _PyUnicode_CONVERT_BYTES(
681 Py_UCS1, Py_UCS4,
682 PyUnicode_1BYTE_DATA(from) + from_start,
683 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
684 PyUnicode_4BYTE_DATA(to) + to_start
685 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200686 }
687 else if (from_kind == PyUnicode_2BYTE_KIND
688 && to_kind == PyUnicode_4BYTE_KIND)
689 {
690 _PyUnicode_CONVERT_BYTES(
691 Py_UCS2, Py_UCS4,
692 PyUnicode_2BYTE_DATA(from) + from_start,
693 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
694 PyUnicode_4BYTE_DATA(to) + to_start
695 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200696 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200697 else {
698 int invalid_kinds;
699 if (from_kind > to_kind) {
700 /* slow path to check for character overflow */
701 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
702 Py_UCS4 ch, maxchar;
703 Py_ssize_t i;
704
705 maxchar = 0;
706 invalid_kinds = 0;
707 for (i=0; i < how_many; i++) {
708 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
709 if (ch > maxchar) {
710 maxchar = ch;
711 if (maxchar > to_maxchar) {
712 invalid_kinds = 1;
713 break;
714 }
715 }
716 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
717 }
718 }
719 else
720 invalid_kinds = 1;
721 if (invalid_kinds) {
722 PyErr_Format(PyExc_ValueError,
723 "Cannot copy UCS%u characters "
724 "into a string of UCS%u characters",
725 1 << (from_kind - 1),
726 1 << (to_kind -1));
727 return -1;
728 }
729 }
730 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200731}
732
Victor Stinner17222162011-09-28 22:15:37 +0200733/* Find the maximum code point and count the number of surrogate pairs so a
734 correct string length can be computed before converting a string to UCS4.
735 This function counts single surrogates as a character and not as a pair.
736
737 Return 0 on success, or -1 on error. */
738static int
739find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
740 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200741{
742 const wchar_t *iter;
743
744 if (num_surrogates == NULL || maxchar == NULL) {
745 PyErr_SetString(PyExc_SystemError,
746 "unexpected NULL arguments to "
747 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
748 return -1;
749 }
750
751 *num_surrogates = 0;
752 *maxchar = 0;
753
754 for (iter = begin; iter < end; ) {
755 if (*iter > *maxchar)
756 *maxchar = *iter;
757#if SIZEOF_WCHAR_T == 2
758 if (*iter >= 0xD800 && *iter <= 0xDBFF
759 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
760 {
761 Py_UCS4 surrogate_val;
762 surrogate_val = (((iter[0] & 0x3FF)<<10)
763 | (iter[1] & 0x3FF)) + 0x10000;
764 ++(*num_surrogates);
765 if (surrogate_val > *maxchar)
766 *maxchar = surrogate_val;
767 iter += 2;
768 }
769 else
770 iter++;
771#else
772 iter++;
773#endif
774 }
775 return 0;
776}
777
778#ifdef Py_DEBUG
779int unicode_ready_calls = 0;
780#endif
781
782int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200783_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200784{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200785 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200786 wchar_t *end;
787 Py_UCS4 maxchar = 0;
788 Py_ssize_t num_surrogates;
789#if SIZEOF_WCHAR_T == 2
790 Py_ssize_t length_wo_surrogates;
791#endif
792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200793 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200794 strings were created using _PyObject_New() and where no canonical
795 representation (the str field) has been set yet aka strings
796 which are not yet ready. */
797 assert(PyUnicode_Check(obj));
798 assert(!PyUnicode_IS_READY(obj));
799 assert(!PyUnicode_IS_COMPACT(obj));
800 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200802 assert(unicode->data.any == NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803 assert(unicode->_base.utf8 == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200804 /* Actually, it should neither be interned nor be anything else: */
805 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200806
807#ifdef Py_DEBUG
808 ++unicode_ready_calls;
809#endif
810
811 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200812 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200813 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200814 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200815
816 if (maxchar < 256) {
817 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
818 if (!unicode->data.any) {
819 PyErr_NoMemory();
820 return -1;
821 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200822 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200823 _PyUnicode_WSTR(unicode), end,
824 PyUnicode_1BYTE_DATA(unicode));
825 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
826 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
827 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
828 if (maxchar < 128) {
829 unicode->_base.utf8 = unicode->data.any;
830 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
831 }
832 else {
833 unicode->_base.utf8 = NULL;
834 unicode->_base.utf8_length = 0;
835 }
836 PyObject_FREE(_PyUnicode_WSTR(unicode));
837 _PyUnicode_WSTR(unicode) = NULL;
838 _PyUnicode_WSTR_LENGTH(unicode) = 0;
839 }
840 /* In this case we might have to convert down from 4-byte native
841 wchar_t to 2-byte unicode. */
842 else if (maxchar < 65536) {
843 assert(num_surrogates == 0 &&
844 "FindMaxCharAndNumSurrogatePairs() messed up");
845
Victor Stinner506f5922011-09-28 22:34:18 +0200846#if SIZEOF_WCHAR_T == 2
847 /* We can share representations and are done. */
848 unicode->data.any = _PyUnicode_WSTR(unicode);
849 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
850 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
851 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
852 unicode->_base.utf8 = NULL;
853 unicode->_base.utf8_length = 0;
854#else
855 /* sizeof(wchar_t) == 4 */
856 unicode->data.any = PyObject_MALLOC(
857 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
858 if (!unicode->data.any) {
859 PyErr_NoMemory();
860 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 }
Victor Stinner506f5922011-09-28 22:34:18 +0200862 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
863 _PyUnicode_WSTR(unicode), end,
864 PyUnicode_2BYTE_DATA(unicode));
865 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
866 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
867 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
868 unicode->_base.utf8 = NULL;
869 unicode->_base.utf8_length = 0;
870 PyObject_FREE(_PyUnicode_WSTR(unicode));
871 _PyUnicode_WSTR(unicode) = NULL;
872 _PyUnicode_WSTR_LENGTH(unicode) = 0;
873#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874 }
875 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
876 else {
877#if SIZEOF_WCHAR_T == 2
878 /* in case the native representation is 2-bytes, we need to allocate a
879 new normalized 4-byte version. */
880 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
881 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
882 if (!unicode->data.any) {
883 PyErr_NoMemory();
884 return -1;
885 }
886 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
887 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
888 unicode->_base.utf8 = NULL;
889 unicode->_base.utf8_length = 0;
890 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
891 unicode) < 0) {
892 assert(0 && "ConvertWideCharToUCS4 failed");
893 return -1;
894 }
895 PyObject_FREE(_PyUnicode_WSTR(unicode));
896 _PyUnicode_WSTR(unicode) = NULL;
897 _PyUnicode_WSTR_LENGTH(unicode) = 0;
898#else
899 assert(num_surrogates == 0);
900
901 unicode->data.any = _PyUnicode_WSTR(unicode);
902 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
903 unicode->_base.utf8 = NULL;
904 unicode->_base.utf8_length = 0;
905 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
906#endif
907 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
908 }
909 _PyUnicode_STATE(unicode).ready = 1;
910 return 0;
911}
912
Alexander Belopolsky40018472011-02-26 01:02:56 +0000913static void
914unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000915{
Walter Dörwald16807132007-05-25 13:52:07 +0000916 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000917 case SSTATE_NOT_INTERNED:
918 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000919
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 case SSTATE_INTERNED_MORTAL:
921 /* revive dead object temporarily for DelItem */
922 Py_REFCNT(unicode) = 3;
923 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
924 Py_FatalError(
925 "deletion of interned string failed");
926 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000927
Benjamin Peterson29060642009-01-31 22:14:21 +0000928 case SSTATE_INTERNED_IMMORTAL:
929 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000930
Benjamin Peterson29060642009-01-31 22:14:21 +0000931 default:
932 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000933 }
934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200935 if (_PyUnicode_WSTR(unicode) &&
936 (!PyUnicode_IS_READY(unicode) ||
937 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
938 PyObject_DEL(_PyUnicode_WSTR(unicode));
939 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
940 PyObject_DEL(unicode->_base.utf8);
941
942 if (PyUnicode_IS_COMPACT(unicode)) {
943 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944 }
945 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200946 if (unicode->data.any)
947 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000949 }
950}
951
Alexander Belopolsky40018472011-02-26 01:02:56 +0000952static int
953_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000954{
955 register PyUnicodeObject *v;
956
957 /* Argument checks */
958 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000959 PyErr_BadInternalCall();
960 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000961 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000962 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
964 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000965 PyErr_BadInternalCall();
966 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000967 }
968
969 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970 possible since these are being shared.
971 The same goes for new-representation unicode objects or objects which
972 have already been readied.
973 For these, we simply return a fresh copy with the same Unicode content.
974 */
975 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
976 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
977 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000978 PyUnicodeObject *w = _PyUnicode_New(length);
979 if (w == NULL)
980 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
982 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000983 Py_DECREF(*unicode);
984 *unicode = w;
985 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000986 }
987
988 /* Note that we don't have to modify *unicode for unshared Unicode
989 objects, since we can modify them in-place. */
990 return unicode_resize(v, length);
991}
992
Alexander Belopolsky40018472011-02-26 01:02:56 +0000993int
994PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000995{
996 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
997}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200999static PyObject*
1000get_latin1_char(unsigned char ch)
1001{
1002 PyUnicodeObject *unicode = unicode_latin1[ch];
1003 if (!unicode) {
1004 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1005 if (!unicode)
1006 return NULL;
1007 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1008 unicode_latin1[ch] = unicode;
1009 }
1010 Py_INCREF(unicode);
1011 return (PyObject *)unicode;
1012}
1013
Alexander Belopolsky40018472011-02-26 01:02:56 +00001014PyObject *
1015PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001016{
1017 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001018 Py_UCS4 maxchar = 0;
1019 Py_ssize_t num_surrogates;
1020
1021 if (u == NULL)
1022 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001024 /* If the Unicode data is known at construction time, we can apply
1025 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027 /* Optimization for empty strings */
1028 if (size == 0 && unicode_empty != NULL) {
1029 Py_INCREF(unicode_empty);
1030 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001031 }
Tim Petersced69f82003-09-16 20:30:58 +00001032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001033 /* Single character Unicode objects in the Latin-1 range are
1034 shared when using this constructor */
1035 if (size == 1 && *u < 256)
1036 return get_latin1_char((unsigned char)*u);
1037
1038 /* If not empty and not single character, copy the Unicode data
1039 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001040 if (find_maxchar_surrogates(u, u + size,
1041 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001042 return NULL;
1043
1044 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1045 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046 if (!unicode)
1047 return NULL;
1048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 switch (PyUnicode_KIND(unicode)) {
1050 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001051 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1053 break;
1054 case PyUnicode_2BYTE_KIND:
1055#if Py_UNICODE_SIZE == 2
1056 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1057#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001058 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1060#endif
1061 break;
1062 case PyUnicode_4BYTE_KIND:
1063#if SIZEOF_WCHAR_T == 2
1064 /* This is the only case which has to process surrogates, thus
1065 a simple copy loop is not enough and we need a function. */
1066 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1067 Py_DECREF(unicode);
1068 return NULL;
1069 }
1070#else
1071 assert(num_surrogates == 0);
1072 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1073#endif
1074 break;
1075 default:
1076 assert(0 && "Impossible state");
1077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079 return (PyObject *)unicode;
1080}
1081
Alexander Belopolsky40018472011-02-26 01:02:56 +00001082PyObject *
1083PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001084{
1085 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001086
Benjamin Peterson14339b62009-01-31 16:36:08 +00001087 if (size < 0) {
1088 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001089 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001090 return NULL;
1091 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001092
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001093 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001094 some optimizations which share commonly used objects.
1095 Also, this means the input must be UTF-8, so fall back to the
1096 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001097 if (u != NULL) {
1098
Benjamin Peterson29060642009-01-31 22:14:21 +00001099 /* Optimization for empty strings */
1100 if (size == 0 && unicode_empty != NULL) {
1101 Py_INCREF(unicode_empty);
1102 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001103 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001104
1105 /* Single characters are shared when using this constructor.
1106 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 if (size == 1 && Py_CHARMASK(*u) < 128)
1108 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001109
1110 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001111 }
1112
Walter Dörwald55507312007-05-18 13:12:10 +00001113 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001114 if (!unicode)
1115 return NULL;
1116
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001117 return (PyObject *)unicode;
1118}
1119
Alexander Belopolsky40018472011-02-26 01:02:56 +00001120PyObject *
1121PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001122{
1123 size_t size = strlen(u);
1124 if (size > PY_SSIZE_T_MAX) {
1125 PyErr_SetString(PyExc_OverflowError, "input too long");
1126 return NULL;
1127 }
1128
1129 return PyUnicode_FromStringAndSize(u, size);
1130}
1131
Victor Stinnere57b1c02011-09-28 22:20:48 +02001132static PyObject*
1133_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001134{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135 PyObject *res;
1136 unsigned char max = 127;
1137 Py_ssize_t i;
1138 for (i = 0; i < size; i++) {
1139 if (u[i] & 0x80) {
1140 max = 255;
1141 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001142 }
1143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 res = PyUnicode_New(size, max);
1145 if (!res)
1146 return NULL;
1147 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1148 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001149}
1150
Victor Stinnere57b1c02011-09-28 22:20:48 +02001151static PyObject*
1152_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153{
1154 PyObject *res;
1155 Py_UCS2 max = 0;
1156 Py_ssize_t i;
1157 for (i = 0; i < size; i++)
1158 if (u[i] > max)
1159 max = u[i];
1160 res = PyUnicode_New(size, max);
1161 if (!res)
1162 return NULL;
1163 if (max >= 256)
1164 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1165 else
1166 for (i = 0; i < size; i++)
1167 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1168 return res;
1169}
1170
Victor Stinnere57b1c02011-09-28 22:20:48 +02001171static PyObject*
1172_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173{
1174 PyObject *res;
1175 Py_UCS4 max = 0;
1176 Py_ssize_t i;
1177 for (i = 0; i < size; i++)
1178 if (u[i] > max)
1179 max = u[i];
1180 res = PyUnicode_New(size, max);
1181 if (!res)
1182 return NULL;
1183 if (max >= 0x10000)
1184 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1185 else {
1186 int kind = PyUnicode_KIND(res);
1187 void *data = PyUnicode_DATA(res);
1188 for (i = 0; i < size; i++)
1189 PyUnicode_WRITE(kind, data, i, u[i]);
1190 }
1191 return res;
1192}
1193
1194PyObject*
1195PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1196{
1197 switch(kind) {
1198 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001199 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001201 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001203 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204 }
1205 assert(0);
1206 return NULL;
1207}
1208
Victor Stinner034f6cf2011-09-30 02:26:44 +02001209PyObject*
1210PyUnicode_Copy(PyObject *unicode)
1211{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001212 Py_ssize_t size;
1213 PyObject *copy;
1214 void *data;
1215
Victor Stinner034f6cf2011-09-30 02:26:44 +02001216 if (!PyUnicode_Check(unicode)) {
1217 PyErr_BadInternalCall();
1218 return NULL;
1219 }
1220 if (PyUnicode_READY(unicode))
1221 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001222
1223 size = PyUnicode_GET_LENGTH(unicode);
1224 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1225 if (!copy)
1226 return NULL;
1227 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1228
1229 data = PyUnicode_DATA(unicode);
1230 switch (PyUnicode_KIND(unicode))
1231 {
1232 case PyUnicode_1BYTE_KIND:
1233 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1234 break;
1235 case PyUnicode_2BYTE_KIND:
1236 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1237 break;
1238 case PyUnicode_4BYTE_KIND:
1239 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1240 break;
1241 default:
1242 assert(0);
1243 break;
1244 }
1245 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001246}
1247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248
1249/* Widen Unicode objects to larger buffers.
1250 Return NULL if the string is too wide already. */
1251
1252void*
1253_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1254{
1255 Py_ssize_t i;
1256 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1257 void *d = PyUnicode_DATA(s);
1258 unsigned int skind = PyUnicode_KIND(s);
1259 if (PyUnicode_KIND(s) >= kind) {
1260 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1261 return NULL;
1262 }
1263 switch(kind) {
1264 case PyUnicode_2BYTE_KIND: {
1265 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1266 if (!result) {
1267 PyErr_NoMemory();
1268 return 0;
1269 }
1270 for (i = 0; i < len; i++)
1271 result[i] = ((Py_UCS1*)d)[i];
1272 return result;
1273 }
1274 case PyUnicode_4BYTE_KIND: {
1275 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1276 if (!result) {
1277 PyErr_NoMemory();
1278 return 0;
1279 }
1280 for (i = 0; i < len; i++)
1281 result[i] = PyUnicode_READ(skind, d, i);
1282 return result;
1283 }
1284 }
1285 Py_FatalError("invalid kind");
1286 return NULL;
1287}
1288
1289static Py_UCS4*
1290as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1291 int copy_null)
1292{
1293 int kind;
1294 void *data;
1295 Py_ssize_t len, targetlen;
1296 if (PyUnicode_READY(string) == -1)
1297 return NULL;
1298 kind = PyUnicode_KIND(string);
1299 data = PyUnicode_DATA(string);
1300 len = PyUnicode_GET_LENGTH(string);
1301 targetlen = len;
1302 if (copy_null)
1303 targetlen++;
1304 if (!target) {
1305 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1306 PyErr_NoMemory();
1307 return NULL;
1308 }
1309 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1310 if (!target) {
1311 PyErr_NoMemory();
1312 return NULL;
1313 }
1314 }
1315 else {
1316 if (targetsize < targetlen) {
1317 PyErr_Format(PyExc_SystemError,
1318 "string is longer than the buffer");
1319 if (copy_null && 0 < targetsize)
1320 target[0] = 0;
1321 return NULL;
1322 }
1323 }
1324 if (kind != PyUnicode_4BYTE_KIND) {
1325 Py_ssize_t i;
1326 for (i = 0; i < len; i++)
1327 target[i] = PyUnicode_READ(kind, data, i);
1328 }
1329 else
1330 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1331 if (copy_null)
1332 target[len] = 0;
1333 return target;
1334}
1335
1336Py_UCS4*
1337PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1338 int copy_null)
1339{
1340 if (target == NULL || targetsize < 1) {
1341 PyErr_BadInternalCall();
1342 return NULL;
1343 }
1344 return as_ucs4(string, target, targetsize, copy_null);
1345}
1346
1347Py_UCS4*
1348PyUnicode_AsUCS4Copy(PyObject *string)
1349{
1350 return as_ucs4(string, NULL, 0, 1);
1351}
1352
1353#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001354
Alexander Belopolsky40018472011-02-26 01:02:56 +00001355PyObject *
1356PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001359 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001360 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001361 PyErr_BadInternalCall();
1362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001363 }
1364
Martin v. Löwis790465f2008-04-05 20:41:37 +00001365 if (size == -1) {
1366 size = wcslen(w);
1367 }
1368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370}
1371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001373
Walter Dörwald346737f2007-05-31 10:44:43 +00001374static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001375makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1376 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001377{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001378 *fmt++ = '%';
1379 if (width) {
1380 if (zeropad)
1381 *fmt++ = '0';
1382 fmt += sprintf(fmt, "%d", width);
1383 }
1384 if (precision)
1385 fmt += sprintf(fmt, ".%d", precision);
1386 if (longflag)
1387 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001388 else if (longlongflag) {
1389 /* longlongflag should only ever be nonzero on machines with
1390 HAVE_LONG_LONG defined */
1391#ifdef HAVE_LONG_LONG
1392 char *f = PY_FORMAT_LONG_LONG;
1393 while (*f)
1394 *fmt++ = *f++;
1395#else
1396 /* we shouldn't ever get here */
1397 assert(0);
1398 *fmt++ = 'l';
1399#endif
1400 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001401 else if (size_tflag) {
1402 char *f = PY_FORMAT_SIZE_T;
1403 while (*f)
1404 *fmt++ = *f++;
1405 }
1406 *fmt++ = c;
1407 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001408}
1409
/* helper for PyUnicode_FromFormatV() */

/* Parse the part of a format specification that follows '%'.

   `f` points at the '%'.  The optional width and ".precision" are read,
   followed by an optional length modifier ("l", "ll" when HAVE_LONG_LONG
   is defined, or "z"); a modifier is only accepted when it is directly
   followed by 'd', 'u' or 'i'.  Each result is stored through its output
   pointer unless that pointer is NULL.

   Return a pointer to the character at which parsing stopped. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    while (Py_ISDIGIT((unsigned)*f)) {
        width = (width*10) + *f - '0';
        f++;
    }
    if (*f == '.') {
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            precision = (precision*10) + *f - '0';
            f++;
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    switch (*f) {
    case 'l':
        /* Handle %ld, %lu, %li and (if available) %lld, %llu, %lli. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* handle the size_t flag. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1474
/* Buffer space reserved for the output of %ld: 21 characters cover a
   64-bit integer printed in decimal together with an optional sign. */
#define MAX_LONG_CHARS 21
/* Buffer space reserved for the output of %lld.  At most
   ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for the
   sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG * 53) - 1) / 22)
1482
Walter Dörwaldd2034312007-05-18 16:29:38 +00001483PyObject *
1484PyUnicode_FromFormatV(const char *format, va_list vargs)
1485{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001486 va_list count;
1487 Py_ssize_t callcount = 0;
1488 PyObject **callresults = NULL;
1489 PyObject **callresult = NULL;
1490 Py_ssize_t n = 0;
1491 int width = 0;
1492 int precision = 0;
1493 int zeropad;
1494 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001496 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001497 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001498 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1499 Py_UCS4 argmaxchar;
1500 Py_ssize_t numbersize = 0;
1501 char *numberresults = NULL;
1502 char *numberresult = NULL;
1503 Py_ssize_t i;
1504 int kind;
1505 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001506
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001507 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001508 /* step 1: count the number of %S/%R/%A/%s format specifications
1509 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1510 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 * result in an array)
1512 * also estimate an upper bound for all the number formats in the string;
1513 * numbers will be formatted in step 3 and kept in a '\0'-separated
1514 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001515 for (f = format; *f; f++) {
1516 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001517 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1519 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1520 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1521 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001524#ifdef HAVE_LONG_LONG
1525 if (longlongflag) {
1526 if (width < MAX_LONG_LONG_CHARS)
1527 width = MAX_LONG_LONG_CHARS;
1528 }
1529 else
1530#endif
1531 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1532 including sign. Decimal takes the most space. This
1533 isn't enough for octal. If a width is specified we
1534 need more (which we allocate later). */
1535 if (width < MAX_LONG_CHARS)
1536 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537
1538 /* account for the size + '\0' to separate numbers
1539 inside of the numberresults buffer */
1540 numbersize += (width + 1);
1541 }
1542 }
1543 else if ((unsigned char)*f > 127) {
1544 PyErr_Format(PyExc_ValueError,
1545 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1546 "string, got a non-ASCII byte: 0x%02x",
1547 (unsigned char)*f);
1548 return NULL;
1549 }
1550 }
1551 /* step 2: allocate memory for the results of
1552 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1553 if (callcount) {
1554 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1555 if (!callresults) {
1556 PyErr_NoMemory();
1557 return NULL;
1558 }
1559 callresult = callresults;
1560 }
1561 /* step 2.5: allocate memory for the results of formating numbers */
1562 if (numbersize) {
1563 numberresults = PyObject_Malloc(numbersize);
1564 if (!numberresults) {
1565 PyErr_NoMemory();
1566 goto fail;
1567 }
1568 numberresult = numberresults;
1569 }
1570
1571 /* step 3: format numbers and figure out how large a buffer we need */
1572 for (f = format; *f; f++) {
1573 if (*f == '%') {
1574 const char* p;
1575 int longflag;
1576 int longlongflag;
1577 int size_tflag;
1578 int numprinted;
1579
1580 p = f;
1581 zeropad = (f[1] == '0');
1582 f = parse_format_flags(f, &width, &precision,
1583 &longflag, &longlongflag, &size_tflag);
1584 switch (*f) {
1585 case 'c':
1586 {
1587 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001588 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 n++;
1590 break;
1591 }
1592 case '%':
1593 n++;
1594 break;
1595 case 'i':
1596 case 'd':
1597 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1598 width, precision, *f);
1599 if (longflag)
1600 numprinted = sprintf(numberresult, fmt,
1601 va_arg(count, long));
1602#ifdef HAVE_LONG_LONG
1603 else if (longlongflag)
1604 numprinted = sprintf(numberresult, fmt,
1605 va_arg(count, PY_LONG_LONG));
1606#endif
1607 else if (size_tflag)
1608 numprinted = sprintf(numberresult, fmt,
1609 va_arg(count, Py_ssize_t));
1610 else
1611 numprinted = sprintf(numberresult, fmt,
1612 va_arg(count, int));
1613 n += numprinted;
1614 /* advance by +1 to skip over the '\0' */
1615 numberresult += (numprinted + 1);
1616 assert(*(numberresult - 1) == '\0');
1617 assert(*(numberresult - 2) != '\0');
1618 assert(numprinted >= 0);
1619 assert(numberresult <= numberresults + numbersize);
1620 break;
1621 case 'u':
1622 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1623 width, precision, 'u');
1624 if (longflag)
1625 numprinted = sprintf(numberresult, fmt,
1626 va_arg(count, unsigned long));
1627#ifdef HAVE_LONG_LONG
1628 else if (longlongflag)
1629 numprinted = sprintf(numberresult, fmt,
1630 va_arg(count, unsigned PY_LONG_LONG));
1631#endif
1632 else if (size_tflag)
1633 numprinted = sprintf(numberresult, fmt,
1634 va_arg(count, size_t));
1635 else
1636 numprinted = sprintf(numberresult, fmt,
1637 va_arg(count, unsigned int));
1638 n += numprinted;
1639 numberresult += (numprinted + 1);
1640 assert(*(numberresult - 1) == '\0');
1641 assert(*(numberresult - 2) != '\0');
1642 assert(numprinted >= 0);
1643 assert(numberresult <= numberresults + numbersize);
1644 break;
1645 case 'x':
1646 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1647 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1648 n += numprinted;
1649 numberresult += (numprinted + 1);
1650 assert(*(numberresult - 1) == '\0');
1651 assert(*(numberresult - 2) != '\0');
1652 assert(numprinted >= 0);
1653 assert(numberresult <= numberresults + numbersize);
1654 break;
1655 case 'p':
1656 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1657 /* %p is ill-defined: ensure leading 0x. */
1658 if (numberresult[1] == 'X')
1659 numberresult[1] = 'x';
1660 else if (numberresult[1] != 'x') {
1661 memmove(numberresult + 2, numberresult,
1662 strlen(numberresult) + 1);
1663 numberresult[0] = '0';
1664 numberresult[1] = 'x';
1665 numprinted += 2;
1666 }
1667 n += numprinted;
1668 numberresult += (numprinted + 1);
1669 assert(*(numberresult - 1) == '\0');
1670 assert(*(numberresult - 2) != '\0');
1671 assert(numprinted >= 0);
1672 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001673 break;
1674 case 's':
1675 {
1676 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001677 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001678 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1679 if (!str)
1680 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 /* since PyUnicode_DecodeUTF8 returns already flexible
1682 unicode objects, there is no need to call ready on them */
1683 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001684 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001686 /* Remember the str and switch to the next slot */
1687 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001688 break;
1689 }
1690 case 'U':
1691 {
1692 PyObject *obj = va_arg(count, PyObject *);
1693 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 if (PyUnicode_READY(obj) == -1)
1695 goto fail;
1696 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001697 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001699 break;
1700 }
1701 case 'V':
1702 {
1703 PyObject *obj = va_arg(count, PyObject *);
1704 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001705 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001706 assert(obj || str);
1707 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001708 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 if (PyUnicode_READY(obj) == -1)
1710 goto fail;
1711 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001712 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001714 *callresult++ = NULL;
1715 }
1716 else {
1717 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1718 if (!str_obj)
1719 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001721 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001723 *callresult++ = str_obj;
1724 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001725 break;
1726 }
1727 case 'S':
1728 {
1729 PyObject *obj = va_arg(count, PyObject *);
1730 PyObject *str;
1731 assert(obj);
1732 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001734 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001736 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001738 /* Remember the str and switch to the next slot */
1739 *callresult++ = str;
1740 break;
1741 }
1742 case 'R':
1743 {
1744 PyObject *obj = va_arg(count, PyObject *);
1745 PyObject *repr;
1746 assert(obj);
1747 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001749 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001751 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001753 /* Remember the repr and switch to the next slot */
1754 *callresult++ = repr;
1755 break;
1756 }
1757 case 'A':
1758 {
1759 PyObject *obj = va_arg(count, PyObject *);
1760 PyObject *ascii;
1761 assert(obj);
1762 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001764 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001766 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001768 /* Remember the repr and switch to the next slot */
1769 *callresult++ = ascii;
1770 break;
1771 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001772 default:
1773 /* if we stumble upon an unknown
1774 formatting code, copy the rest of
1775 the format string to the output
1776 string. (we cannot just skip the
1777 code, since there's no way to know
1778 what's in the argument list) */
1779 n += strlen(p);
1780 goto expand;
1781 }
1782 } else
1783 n++;
1784 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001786 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001788 we don't have to resize the string.
1789 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001791 if (!string)
1792 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 kind = PyUnicode_KIND(string);
1794 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001795 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001799 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001800 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001801
1802 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1804 /* checking for == because the last argument could be a empty
1805 string, which causes i to point to end, the assert at the end of
1806 the loop */
1807 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001808
Benjamin Peterson14339b62009-01-31 16:36:08 +00001809 switch (*f) {
1810 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001811 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 const int ordinal = va_arg(vargs, int);
1813 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001814 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001815 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001816 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001817 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001818 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001819 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 case 'p':
1821 /* unused, since we already have the result */
1822 if (*f == 'p')
1823 (void) va_arg(vargs, void *);
1824 else
1825 (void) va_arg(vargs, int);
1826 /* extract the result from numberresults and append. */
1827 for (; *numberresult; ++i, ++numberresult)
1828 PyUnicode_WRITE(kind, data, i, *numberresult);
1829 /* skip over the separating '\0' */
1830 assert(*numberresult == '\0');
1831 numberresult++;
1832 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001833 break;
1834 case 's':
1835 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001836 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001838 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 size = PyUnicode_GET_LENGTH(*callresult);
1840 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001841 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1842 *callresult, 0,
1843 size) < 0)
1844 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001846 /* We're done with the unicode()/repr() => forget it */
1847 Py_DECREF(*callresult);
1848 /* switch to next unicode()/repr() result */
1849 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001850 break;
1851 }
1852 case 'U':
1853 {
1854 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 Py_ssize_t size;
1856 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1857 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001858 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1859 obj, 0,
1860 size) < 0)
1861 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001863 break;
1864 }
1865 case 'V':
1866 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001868 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001869 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001870 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871 size = PyUnicode_GET_LENGTH(obj);
1872 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001873 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1874 obj, 0,
1875 size) < 0)
1876 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001877 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001878 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879 size = PyUnicode_GET_LENGTH(*callresult);
1880 assert(PyUnicode_KIND(*callresult) <=
1881 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001882 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1883 *callresult,
1884 0, size) < 0)
1885 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001887 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001888 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001889 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001890 break;
1891 }
1892 case 'S':
1893 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001894 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001895 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001896 /* unused, since we already have the result */
1897 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001899 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1900 *callresult, 0,
1901 PyUnicode_GET_LENGTH(*callresult)) < 0)
1902 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001904 /* We're done with the unicode()/repr() => forget it */
1905 Py_DECREF(*callresult);
1906 /* switch to next unicode()/repr() result */
1907 ++callresult;
1908 break;
1909 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001910 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001912 break;
1913 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 for (; *p; ++p, ++i)
1915 PyUnicode_WRITE(kind, data, i, *p);
1916 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001917 goto end;
1918 }
Victor Stinner1205f272010-09-11 00:54:47 +00001919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920 else {
1921 assert(i < PyUnicode_GET_LENGTH(string));
1922 PyUnicode_WRITE(kind, data, i++, *f);
1923 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001925 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001926
Benjamin Peterson29060642009-01-31 22:14:21 +00001927 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001928 if (callresults)
1929 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 if (numberresults)
1931 PyObject_Free(numberresults);
1932 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001933 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001934 if (callresults) {
1935 PyObject **callresult2 = callresults;
1936 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001937 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001938 ++callresult2;
1939 }
1940 PyObject_Free(callresults);
1941 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001942 if (numberresults)
1943 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001944 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001945}
1946
Walter Dörwaldd2034312007-05-18 16:29:38 +00001947PyObject *
1948PyUnicode_FromFormat(const char *format, ...)
1949{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001950 PyObject* ret;
1951 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001952
1953#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001954 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001955#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001956 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001957#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001958 ret = PyUnicode_FromFormatV(format, vargs);
1959 va_end(vargs);
1960 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001961}
1962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963#ifdef HAVE_WCHAR_H
1964
Victor Stinner5593d8a2010-10-02 11:11:27 +00001965/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1966 convert a Unicode object to a wide character string.
1967
Victor Stinnerd88d9832011-09-06 02:00:05 +02001968 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001969 character) required to convert the unicode object. Ignore size argument.
1970
Victor Stinnerd88d9832011-09-06 02:00:05 +02001971 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001972 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001973 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001974static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001975unicode_aswidechar(PyUnicodeObject *unicode,
1976 wchar_t *w,
1977 Py_ssize_t size)
1978{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001979 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980 const wchar_t *wstr;
1981
1982 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1983 if (wstr == NULL)
1984 return -1;
1985
Victor Stinner5593d8a2010-10-02 11:11:27 +00001986 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001987 if (size > res)
1988 size = res + 1;
1989 else
1990 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001992 return res;
1993 }
1994 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001996}
1997
1998Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001999PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002000 wchar_t *w,
2001 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002002{
2003 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002004 PyErr_BadInternalCall();
2005 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002007 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008}
2009
Victor Stinner137c34c2010-09-29 10:25:54 +00002010wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002011PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002012 Py_ssize_t *size)
2013{
2014 wchar_t* buffer;
2015 Py_ssize_t buflen;
2016
2017 if (unicode == NULL) {
2018 PyErr_BadInternalCall();
2019 return NULL;
2020 }
2021
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002022 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 if (buflen == -1)
2024 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002025 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002026 PyErr_NoMemory();
2027 return NULL;
2028 }
2029
Victor Stinner137c34c2010-09-29 10:25:54 +00002030 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2031 if (buffer == NULL) {
2032 PyErr_NoMemory();
2033 return NULL;
2034 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002035 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 if (buflen == -1)
2037 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002038 if (size != NULL)
2039 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002040 return buffer;
2041}
2042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044
Alexander Belopolsky40018472011-02-26 01:02:56 +00002045PyObject *
2046PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002047{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002049 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002050 PyErr_SetString(PyExc_ValueError,
2051 "chr() arg not in range(0x110000)");
2052 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002053 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 if (ordinal < 256)
2056 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058 v = PyUnicode_New(1, ordinal);
2059 if (v == NULL)
2060 return NULL;
2061 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2062 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002063}
2064
Alexander Belopolsky40018472011-02-26 01:02:56 +00002065PyObject *
2066PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002068 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002069 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002070 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002071 Py_INCREF(obj);
2072 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002073 }
2074 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002075 /* For a Unicode subtype that's not a Unicode object,
2076 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002077 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002078 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002079 PyErr_Format(PyExc_TypeError,
2080 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002081 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002082 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002083}
2084
Alexander Belopolsky40018472011-02-26 01:02:56 +00002085PyObject *
2086PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002087 const char *encoding,
2088 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002089{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002090 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002091 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002092
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002094 PyErr_BadInternalCall();
2095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002097
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002098 /* Decoding bytes objects is the most common case and should be fast */
2099 if (PyBytes_Check(obj)) {
2100 if (PyBytes_GET_SIZE(obj) == 0) {
2101 Py_INCREF(unicode_empty);
2102 v = (PyObject *) unicode_empty;
2103 }
2104 else {
2105 v = PyUnicode_Decode(
2106 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2107 encoding, errors);
2108 }
2109 return v;
2110 }
2111
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002112 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002113 PyErr_SetString(PyExc_TypeError,
2114 "decoding str is not supported");
2115 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002116 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002117
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002118 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2119 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2120 PyErr_Format(PyExc_TypeError,
2121 "coercing to str: need bytes, bytearray "
2122 "or buffer-like object, %.80s found",
2123 Py_TYPE(obj)->tp_name);
2124 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002125 }
Tim Petersced69f82003-09-16 20:30:58 +00002126
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002127 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002128 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002129 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 }
Tim Petersced69f82003-09-16 20:30:58 +00002131 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002132 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002133
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002134 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002135 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136}
2137
/* Write a normalized copy of encoding into lower: upper case letters are
   converted to lower case and every '_' becomes '-', so that spellings
   such as UTF_8 are recognized.  lower_len is the capacity of lower,
   including room for the terminating null byte.

   Return 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1 characters); in the latter case lower is left without a
   terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminator. */
    char *const last = lower + (lower_len - 1);

    for (src = encoding; *src; src++) {
        char ch = *src;

        if (dst == last)
            return 0;

        if (ch == '_')
            ch = '-';
        else if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);

        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2170
Alexander Belopolsky40018472011-02-26 01:02:56 +00002171PyObject *
2172PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002173 Py_ssize_t size,
2174 const char *encoding,
2175 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002176{
2177 PyObject *buffer = NULL, *unicode;
2178 Py_buffer info;
2179 char lower[11]; /* Enough for any encoding shortcut */
2180
2181 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002182 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002183
2184 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002185 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002186 if ((strcmp(lower, "utf-8") == 0) ||
2187 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002188 return PyUnicode_DecodeUTF8(s, size, errors);
2189 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002190 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002191 (strcmp(lower, "iso-8859-1") == 0))
2192 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002193#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002194 else if (strcmp(lower, "mbcs") == 0)
2195 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002196#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002197 else if (strcmp(lower, "ascii") == 0)
2198 return PyUnicode_DecodeASCII(s, size, errors);
2199 else if (strcmp(lower, "utf-16") == 0)
2200 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2201 else if (strcmp(lower, "utf-32") == 0)
2202 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204
2205 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002206 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002207 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002208 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002209 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 if (buffer == NULL)
2211 goto onError;
2212 unicode = PyCodec_Decode(buffer, encoding, errors);
2213 if (unicode == NULL)
2214 goto onError;
2215 if (!PyUnicode_Check(unicode)) {
2216 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002217 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002218 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002219 Py_DECREF(unicode);
2220 goto onError;
2221 }
2222 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 if (PyUnicode_READY(unicode)) {
2224 Py_DECREF(unicode);
2225 return NULL;
2226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002228
Benjamin Peterson29060642009-01-31 22:14:21 +00002229 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 Py_XDECREF(buffer);
2231 return NULL;
2232}
2233
Alexander Belopolsky40018472011-02-26 01:02:56 +00002234PyObject *
2235PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002236 const char *encoding,
2237 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002238{
2239 PyObject *v;
2240
2241 if (!PyUnicode_Check(unicode)) {
2242 PyErr_BadArgument();
2243 goto onError;
2244 }
2245
2246 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002247 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002248
2249 /* Decode via the codec registry */
2250 v = PyCodec_Decode(unicode, encoding, errors);
2251 if (v == NULL)
2252 goto onError;
2253 return v;
2254
Benjamin Peterson29060642009-01-31 22:14:21 +00002255 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002256 return NULL;
2257}
2258
Alexander Belopolsky40018472011-02-26 01:02:56 +00002259PyObject *
2260PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002261 const char *encoding,
2262 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002263{
2264 PyObject *v;
2265
2266 if (!PyUnicode_Check(unicode)) {
2267 PyErr_BadArgument();
2268 goto onError;
2269 }
2270
2271 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002272 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002273
2274 /* Decode via the codec registry */
2275 v = PyCodec_Decode(unicode, encoding, errors);
2276 if (v == NULL)
2277 goto onError;
2278 if (!PyUnicode_Check(v)) {
2279 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002280 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002281 Py_TYPE(v)->tp_name);
2282 Py_DECREF(v);
2283 goto onError;
2284 }
2285 return v;
2286
Benjamin Peterson29060642009-01-31 22:14:21 +00002287 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002288 return NULL;
2289}
2290
Alexander Belopolsky40018472011-02-26 01:02:56 +00002291PyObject *
2292PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002293 Py_ssize_t size,
2294 const char *encoding,
2295 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296{
2297 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002298
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 unicode = PyUnicode_FromUnicode(s, size);
2300 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002301 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2303 Py_DECREF(unicode);
2304 return v;
2305}
2306
Alexander Belopolsky40018472011-02-26 01:02:56 +00002307PyObject *
2308PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002309 const char *encoding,
2310 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002311{
2312 PyObject *v;
2313
2314 if (!PyUnicode_Check(unicode)) {
2315 PyErr_BadArgument();
2316 goto onError;
2317 }
2318
2319 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002320 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002321
2322 /* Encode via the codec registry */
2323 v = PyCodec_Encode(unicode, encoding, errors);
2324 if (v == NULL)
2325 goto onError;
2326 return v;
2327
Benjamin Peterson29060642009-01-31 22:14:21 +00002328 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002329 return NULL;
2330}
2331
Victor Stinnerad158722010-10-27 00:25:46 +00002332PyObject *
2333PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002334{
Victor Stinner99b95382011-07-04 14:23:54 +02002335#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002336 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2337 PyUnicode_GET_SIZE(unicode),
2338 NULL);
2339#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002340 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002341#else
Victor Stinner793b5312011-04-27 00:24:21 +02002342 PyInterpreterState *interp = PyThreadState_GET()->interp;
2343 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2344 cannot use it to encode and decode filenames before it is loaded. Load
2345 the Python codec requires to encode at least its own filename. Use the C
2346 version of the locale codec until the codec registry is initialized and
2347 the Python codec is loaded.
2348
2349 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2350 cannot only rely on it: check also interp->fscodec_initialized for
2351 subinterpreters. */
2352 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002353 return PyUnicode_AsEncodedString(unicode,
2354 Py_FileSystemDefaultEncoding,
2355 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002356 }
2357 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002358 /* locale encoding with surrogateescape */
2359 wchar_t *wchar;
2360 char *bytes;
2361 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002362 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002363
2364 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2365 if (wchar == NULL)
2366 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002367 bytes = _Py_wchar2char(wchar, &error_pos);
2368 if (bytes == NULL) {
2369 if (error_pos != (size_t)-1) {
2370 char *errmsg = strerror(errno);
2371 PyObject *exc = NULL;
2372 if (errmsg == NULL)
2373 errmsg = "Py_wchar2char() failed";
2374 raise_encode_exception(&exc,
2375 "filesystemencoding",
2376 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2377 error_pos, error_pos+1,
2378 errmsg);
2379 Py_XDECREF(exc);
2380 }
2381 else
2382 PyErr_NoMemory();
2383 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002384 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002385 }
2386 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002387
2388 bytes_obj = PyBytes_FromString(bytes);
2389 PyMem_Free(bytes);
2390 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002391 }
Victor Stinnerad158722010-10-27 00:25:46 +00002392#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002393}
2394
Alexander Belopolsky40018472011-02-26 01:02:56 +00002395PyObject *
2396PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002397 const char *encoding,
2398 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399{
2400 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002401 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002402
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 if (!PyUnicode_Check(unicode)) {
2404 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406 }
Fred Drakee4315f52000-05-09 19:53:39 +00002407
Victor Stinner2f283c22011-03-02 01:21:46 +00002408 if (encoding == NULL) {
2409 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002411 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002413 }
Fred Drakee4315f52000-05-09 19:53:39 +00002414
2415 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002416 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002417 if ((strcmp(lower, "utf-8") == 0) ||
2418 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002419 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002420 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002422 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002424 }
Victor Stinner37296e82010-06-10 13:36:23 +00002425 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002426 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002427 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002429#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002430 else if (strcmp(lower, "mbcs") == 0)
2431 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2432 PyUnicode_GET_SIZE(unicode),
2433 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002434#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002435 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002436 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002438
2439 /* Encode via the codec registry */
2440 v = PyCodec_Encode(unicode, encoding, errors);
2441 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002442 return NULL;
2443
2444 /* The normal path */
2445 if (PyBytes_Check(v))
2446 return v;
2447
2448 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002449 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002450 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002451 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002452
2453 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2454 "encoder %s returned bytearray instead of bytes",
2455 encoding);
2456 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002457 Py_DECREF(v);
2458 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002459 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002460
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002461 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2462 Py_DECREF(v);
2463 return b;
2464 }
2465
2466 PyErr_Format(PyExc_TypeError,
2467 "encoder did not return a bytes object (type=%.400s)",
2468 Py_TYPE(v)->tp_name);
2469 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002470 return NULL;
2471}
2472
Alexander Belopolsky40018472011-02-26 01:02:56 +00002473PyObject *
2474PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002475 const char *encoding,
2476 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002477{
2478 PyObject *v;
2479
2480 if (!PyUnicode_Check(unicode)) {
2481 PyErr_BadArgument();
2482 goto onError;
2483 }
2484
2485 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002486 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002487
2488 /* Encode via the codec registry */
2489 v = PyCodec_Encode(unicode, encoding, errors);
2490 if (v == NULL)
2491 goto onError;
2492 if (!PyUnicode_Check(v)) {
2493 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002494 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002495 Py_TYPE(v)->tp_name);
2496 Py_DECREF(v);
2497 goto onError;
2498 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002500
Benjamin Peterson29060642009-01-31 22:14:21 +00002501 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502 return NULL;
2503}
2504
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002505PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002506PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002507 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002508 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2509}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002510
Christian Heimes5894ba72007-11-04 11:43:14 +00002511PyObject*
2512PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2513{
Victor Stinner99b95382011-07-04 14:23:54 +02002514#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002515 return PyUnicode_DecodeMBCS(s, size, NULL);
2516#elif defined(__APPLE__)
2517 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2518#else
Victor Stinner793b5312011-04-27 00:24:21 +02002519 PyInterpreterState *interp = PyThreadState_GET()->interp;
2520 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2521 cannot use it to encode and decode filenames before it is loaded. Load
2522 the Python codec requires to encode at least its own filename. Use the C
2523 version of the locale codec until the codec registry is initialized and
2524 the Python codec is loaded.
2525
2526 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2527 cannot only rely on it: check also interp->fscodec_initialized for
2528 subinterpreters. */
2529 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002530 return PyUnicode_Decode(s, size,
2531 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002532 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002533 }
2534 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002535 /* locale encoding with surrogateescape */
2536 wchar_t *wchar;
2537 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002538 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002539
2540 if (s[size] != '\0' || size != strlen(s)) {
2541 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2542 return NULL;
2543 }
2544
Victor Stinner168e1172010-10-16 23:16:16 +00002545 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002546 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002547 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002548
Victor Stinner168e1172010-10-16 23:16:16 +00002549 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002550 PyMem_Free(wchar);
2551 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002552 }
Victor Stinnerad158722010-10-27 00:25:46 +00002553#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002554}
2555
Martin v. Löwis011e8422009-05-05 04:43:17 +00002556
2557int
2558PyUnicode_FSConverter(PyObject* arg, void* addr)
2559{
2560 PyObject *output = NULL;
2561 Py_ssize_t size;
2562 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002563 if (arg == NULL) {
2564 Py_DECREF(*(PyObject**)addr);
2565 return 1;
2566 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002567 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002568 output = arg;
2569 Py_INCREF(output);
2570 }
2571 else {
2572 arg = PyUnicode_FromObject(arg);
2573 if (!arg)
2574 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002575 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002576 Py_DECREF(arg);
2577 if (!output)
2578 return 0;
2579 if (!PyBytes_Check(output)) {
2580 Py_DECREF(output);
2581 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2582 return 0;
2583 }
2584 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002585 size = PyBytes_GET_SIZE(output);
2586 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002587 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002588 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002589 Py_DECREF(output);
2590 return 0;
2591 }
2592 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002593 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002594}
2595
2596
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002597int
2598PyUnicode_FSDecoder(PyObject* arg, void* addr)
2599{
2600 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002601 if (arg == NULL) {
2602 Py_DECREF(*(PyObject**)addr);
2603 return 1;
2604 }
2605 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002606 if (PyUnicode_READY(arg))
2607 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002608 output = arg;
2609 Py_INCREF(output);
2610 }
2611 else {
2612 arg = PyBytes_FromObject(arg);
2613 if (!arg)
2614 return 0;
2615 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2616 PyBytes_GET_SIZE(arg));
2617 Py_DECREF(arg);
2618 if (!output)
2619 return 0;
2620 if (!PyUnicode_Check(output)) {
2621 Py_DECREF(output);
2622 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2623 return 0;
2624 }
2625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2627 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002628 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2629 Py_DECREF(output);
2630 return 0;
2631 }
2632 *(PyObject**)addr = output;
2633 return Py_CLEANUP_SUPPORTED;
2634}
2635
2636
Martin v. Löwis5b222132007-06-10 09:51:05 +00002637char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002639{
Christian Heimesf3863112007-11-22 07:46:41 +00002640 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2642
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002643 if (!PyUnicode_Check(unicode)) {
2644 PyErr_BadArgument();
2645 return NULL;
2646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002647 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002648 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002649
2650 if (_PyUnicode_UTF8(unicode) == NULL) {
2651 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2652 if (bytes == NULL)
2653 return NULL;
2654 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2655 if (u->_base.utf8 == NULL) {
2656 Py_DECREF(bytes);
2657 return NULL;
2658 }
2659 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2660 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2661 Py_DECREF(bytes);
2662 }
2663
2664 if (psize)
2665 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2666 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002667}
2668
2669char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002670PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002671{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2673}
2674
/* Debug builds only: counter incremented by PyUnicode_AsUnicodeAndSize()
   each time it has to build the wchar_t representation of a string. */
#ifdef Py_DEBUG
int unicode_as_unicode_calls = 0;
#endif
2678
2679
/* Return the wchar_t (Py_UNICODE) representation of a str object.

   unicode -- must be a str; otherwise PyErr_BadArgument() is raised and
              NULL is returned.
   size    -- if not NULL, receives PyUnicode_WSTR_LENGTH() of the object.

   When the object has no wstr buffer yet, one is allocated with
   PyObject_MALLOC(), filled from the 1-, 2- or 4-byte character data,
   NUL-terminated and stored in the object, so later calls return the
   stored buffer.  Which conversions are compiled in depends on
   SIZEOF_WCHAR_T.  Allocation failure raises MemoryError and returns
   NULL. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    PyUnicodeObject *u;
    const unsigned char *one_byte;
    /* Only the source pointers needed for the conversions compiled in below
       are declared. */
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    u = (PyUnicodeObject*)unicode;
    if (_PyUnicode_WSTR(u) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(u) != 0);
        assert(PyUnicode_IS_READY(u));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count code points above 0xFFFF; each of them
               needs two wchar_t units (a surrogate pair). */
            four_bytes = PyUnicode_4BYTE_DATA(u);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One unit per code point, one extra per surrogate pair, plus
               one for the terminating NUL. */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;

            /* Second pass: copy, splitting large code points. */
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
            four_bytes = PyUnicode_4BYTE_DATA(u);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    /* encode surrogate pair in this case */
                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
                }
                else
                    *w = *four_bytes;

                /* NOTE(review): this check runs after the store above and
                   only asserts, so it is a debug aid rather than a guard --
                   confirm that is the intent. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte data: one wchar_t per character plus the
               terminating NUL. */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(u) + 1));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* NOTE(review): the wstr length is not stored for compact ASCII
               objects, presumably because that layout has no separate
               wstr_length field -- confirm against the struct definition. */
            if (!PyUnicode_IS_COMPACT_ASCII(u))
                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_LENGTH(u);

            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to a wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(u);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 2-byte unit to a wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(u);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Release the buffer allocated above before aborting. */
                PyObject_FREE(_PyUnicode_WSTR(u));
                _PyUnicode_WSTR(u) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                /* NOTE(review): when assertions are compiled out, this path
                   leaves the freshly allocated buffer unfilled. */
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(u);
    return _PyUnicode_WSTR(u);
}
2796
Alexander Belopolsky40018472011-02-26 01:02:56 +00002797Py_UNICODE *
2798PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002800 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801}
2802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002803
Alexander Belopolsky40018472011-02-26 01:02:56 +00002804Py_ssize_t
2805PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806{
2807 if (!PyUnicode_Check(unicode)) {
2808 PyErr_BadArgument();
2809 goto onError;
2810 }
2811 return PyUnicode_GET_SIZE(unicode);
2812
Benjamin Peterson29060642009-01-31 22:14:21 +00002813 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814 return -1;
2815}
2816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002817Py_ssize_t
2818PyUnicode_GetLength(PyObject *unicode)
2819{
2820 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2821 PyErr_BadArgument();
2822 return -1;
2823 }
2824
2825 return PyUnicode_GET_LENGTH(unicode);
2826}
2827
2828Py_UCS4
2829PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2830{
2831 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2832 return PyErr_BadArgument();
2833 return (Py_UCS4)-1;
2834 }
2835 return PyUnicode_READ_CHAR(unicode, index);
2836}
2837
2838int
2839PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2840{
2841 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2842 return PyErr_BadArgument();
2843 return -1;
2844 }
2845
2846 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2847 index, ch);
2848 return 0;
2849}
2850
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage duration and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2856
Victor Stinner554f3f02010-06-16 23:33:54 +00002857/* create or adjust a UnicodeDecodeError */
2858static void
2859make_decode_exception(PyObject **exceptionObject,
2860 const char *encoding,
2861 const char *input, Py_ssize_t length,
2862 Py_ssize_t startpos, Py_ssize_t endpos,
2863 const char *reason)
2864{
2865 if (*exceptionObject == NULL) {
2866 *exceptionObject = PyUnicodeDecodeError_Create(
2867 encoding, input, length, startpos, endpos, reason);
2868 }
2869 else {
2870 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2871 goto onError;
2872 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2873 goto onError;
2874 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2875 goto onError;
2876 }
2877 return;
2878
2879onError:
2880 Py_DECREF(*exceptionObject);
2881 *exceptionObject = NULL;
2882}
2883
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002884/* error handling callback helper:
2885 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002886 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887 and adjust various state variables.
2888 return 0 on success, -1 on error
2889*/
2890
Alexander Belopolsky40018472011-02-26 01:02:56 +00002891static int
2892unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002893 const char *encoding, const char *reason,
2894 const char **input, const char **inend, Py_ssize_t *startinpos,
2895 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2896 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002897{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002898 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002899
2900 PyObject *restuple = NULL;
2901 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002902 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002903 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002904 Py_ssize_t requiredsize;
2905 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002906 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002907 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002908 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002909 int res = -1;
2910
2911 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002912 *errorHandler = PyCodec_LookupError(errors);
2913 if (*errorHandler == NULL)
2914 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002915 }
2916
Victor Stinner554f3f02010-06-16 23:33:54 +00002917 make_decode_exception(exceptionObject,
2918 encoding,
2919 *input, *inend - *input,
2920 *startinpos, *endinpos,
2921 reason);
2922 if (*exceptionObject == NULL)
2923 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002924
2925 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2926 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002927 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002928 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002929 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002930 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002931 }
2932 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002933 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002934
2935 /* Copy back the bytes variables, which might have been modified by the
2936 callback */
2937 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2938 if (!inputobj)
2939 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002940 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002941 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002942 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002943 *input = PyBytes_AS_STRING(inputobj);
2944 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002945 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002946 /* we can DECREF safely, as the exception has another reference,
2947 so the object won't go away. */
2948 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002949
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002950 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002952 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002953 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2954 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002956
2957 /* need more space? (at least enough for what we
2958 have+the replacement+the rest of the string (starting
2959 at the new input position), so we won't have to check space
2960 when there are no errors in the rest of the string) */
2961 repptr = PyUnicode_AS_UNICODE(repunicode);
2962 repsize = PyUnicode_GET_SIZE(repunicode);
2963 requiredsize = *outpos + repsize + insize-newpos;
2964 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002965 if (requiredsize<2*outsize)
2966 requiredsize = 2*outsize;
2967 if (_PyUnicode_Resize(output, requiredsize) < 0)
2968 goto onError;
2969 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002970 }
2971 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002972 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002973 Py_UNICODE_COPY(*outptr, repptr, repsize);
2974 *outptr += repsize;
2975 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002976
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002977 /* we made it! */
2978 res = 0;
2979
Benjamin Peterson29060642009-01-31 22:14:21 +00002980 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002981 Py_XDECREF(restuple);
2982 return res;
2983}
2984
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002985/* --- UTF-7 Codec -------------------------------------------------------- */
2986
Antoine Pitrou244651a2009-05-04 18:56:13 +00002987/* See RFC2152 for details. We encode conservatively and decode liberally. */
2988
/* Three simple macros defining base-64.

   All of them are plain expression macros that may evaluate their argument
   more than once, so arguments must not have side effects. */

/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* Given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62; anything else
   yields 63 (the value of '/'), so c is expected to satisfy IS_BASE64(). */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?  Higher bits of
   n are masked off. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3019
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is read-only.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; only 1..127 can ever be direct, and the
 *             range test runs before the table lookup.
 * directO  -- non-zero to encode Set O characters as themselves.
 * directWS -- non-zero to encode whitespace as itself.
 *
 * c is evaluated several times, so it must not have side effects.  The
 * flag arguments are parenthesized so that compound expressions such as
 * `a || b` keep their meaning inside the expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003065
Alexander Belopolsky40018472011-02-26 01:02:56 +00003066PyObject *
3067PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003068 Py_ssize_t size,
3069 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003070{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003071 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3072}
3073
Antoine Pitrou244651a2009-05-04 18:56:13 +00003074/* The decoder. The only state we preserve is our read position,
3075 * i.e. how many characters we have consumed. So if we end in the
3076 * middle of a shift sequence we have to back off the read position
3077 * and the output to the beginning of the sequence, otherwise we lose
3078 * all the shift state (seen bits, number of bits seen, high
3079 * surrogate). */
3080
Alexander Belopolsky40018472011-02-26 01:02:56 +00003081PyObject *
3082PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003083 Py_ssize_t size,
3084 const char *errors,
3085 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003086{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003087 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003088 Py_ssize_t startinpos;
3089 Py_ssize_t endinpos;
3090 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003091 const char *e;
3092 PyUnicodeObject *unicode;
3093 Py_UNICODE *p;
3094 const char *errmsg = "";
3095 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003096 Py_UNICODE *shiftOutStart;
3097 unsigned int base64bits = 0;
3098 unsigned long base64buffer = 0;
3099 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003100 PyObject *errorHandler = NULL;
3101 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003102
3103 unicode = _PyUnicode_New(size);
3104 if (!unicode)
3105 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003106 if (size == 0) {
3107 if (consumed)
3108 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003109 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003110 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003112 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003113 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003114 e = s + size;
3115
3116 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003117 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003118 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003119 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003120
Antoine Pitrou244651a2009-05-04 18:56:13 +00003121 if (inShift) { /* in a base-64 section */
3122 if (IS_BASE64(ch)) { /* consume a base-64 character */
3123 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3124 base64bits += 6;
3125 s++;
3126 if (base64bits >= 16) {
3127 /* we have enough bits for a UTF-16 value */
3128 Py_UNICODE outCh = (Py_UNICODE)
3129 (base64buffer >> (base64bits-16));
3130 base64bits -= 16;
3131 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3132 if (surrogate) {
3133 /* expecting a second surrogate */
3134 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3135#ifdef Py_UNICODE_WIDE
3136 *p++ = (((surrogate & 0x3FF)<<10)
3137 | (outCh & 0x3FF)) + 0x10000;
3138#else
3139 *p++ = surrogate;
3140 *p++ = outCh;
3141#endif
3142 surrogate = 0;
3143 }
3144 else {
3145 surrogate = 0;
3146 errmsg = "second surrogate missing";
3147 goto utf7Error;
3148 }
3149 }
3150 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3151 /* first surrogate */
3152 surrogate = outCh;
3153 }
3154 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3155 errmsg = "unexpected second surrogate";
3156 goto utf7Error;
3157 }
3158 else {
3159 *p++ = outCh;
3160 }
3161 }
3162 }
3163 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003164 inShift = 0;
3165 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003166 if (surrogate) {
3167 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003168 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003169 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003170 if (base64bits > 0) { /* left-over bits */
3171 if (base64bits >= 6) {
3172 /* We've seen at least one base-64 character */
3173 errmsg = "partial character in shift sequence";
3174 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003175 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003176 else {
3177 /* Some bits remain; they should be zero */
3178 if (base64buffer != 0) {
3179 errmsg = "non-zero padding bits in shift sequence";
3180 goto utf7Error;
3181 }
3182 }
3183 }
3184 if (ch != '-') {
3185 /* '-' is absorbed; other terminating
3186 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003187 *p++ = ch;
3188 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003189 }
3190 }
3191 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003192 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003193 s++; /* consume '+' */
3194 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003195 s++;
3196 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003197 }
3198 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003199 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003200 shiftOutStart = p;
3201 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003202 }
3203 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003204 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003205 *p++ = ch;
3206 s++;
3207 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003208 else {
3209 startinpos = s-starts;
3210 s++;
3211 errmsg = "unexpected special character";
3212 goto utf7Error;
3213 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003214 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003215utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 outpos = p-PyUnicode_AS_UNICODE(unicode);
3217 endinpos = s-starts;
3218 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 errors, &errorHandler,
3220 "utf7", errmsg,
3221 &starts, &e, &startinpos, &endinpos, &exc, &s,
3222 &unicode, &outpos, &p))
3223 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003224 }
3225
Antoine Pitrou244651a2009-05-04 18:56:13 +00003226 /* end of string */
3227
3228 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3229 /* if we're in an inconsistent state, that's an error */
3230 if (surrogate ||
3231 (base64bits >= 6) ||
3232 (base64bits > 0 && base64buffer != 0)) {
3233 outpos = p-PyUnicode_AS_UNICODE(unicode);
3234 endinpos = size;
3235 if (unicode_decode_call_errorhandler(
3236 errors, &errorHandler,
3237 "utf7", "unterminated shift sequence",
3238 &starts, &e, &startinpos, &endinpos, &exc, &s,
3239 &unicode, &outpos, &p))
3240 goto onError;
3241 if (s < e)
3242 goto restart;
3243 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003244 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003245
3246 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003247 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003248 if (inShift) {
3249 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003250 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003251 }
3252 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003253 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003254 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003255 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003256
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003257 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003258 goto onError;
3259
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003260 Py_XDECREF(errorHandler);
3261 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003262 if (PyUnicode_READY(unicode) == -1) {
3263 Py_DECREF(unicode);
3264 return NULL;
3265 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003266 return (PyObject *)unicode;
3267
Benjamin Peterson29060642009-01-31 22:14:21 +00003268 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003269 Py_XDECREF(errorHandler);
3270 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003271 Py_DECREF(unicode);
3272 return NULL;
3273}
3274
3275
Alexander Belopolsky40018472011-02-26 01:02:56 +00003276PyObject *
3277PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003278 Py_ssize_t size,
3279 int base64SetO,
3280 int base64WhiteSpace,
3281 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003282{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003283 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003284 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003285 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003286 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003287 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003288 unsigned int base64bits = 0;
3289 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003290 char * out;
3291 char * start;
3292
3293 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003294 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003295
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003296 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003297 return PyErr_NoMemory();
3298
Antoine Pitrou244651a2009-05-04 18:56:13 +00003299 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003300 if (v == NULL)
3301 return NULL;
3302
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003303 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003304 for (;i < size; ++i) {
3305 Py_UNICODE ch = s[i];
3306
Antoine Pitrou244651a2009-05-04 18:56:13 +00003307 if (inShift) {
3308 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3309 /* shifting out */
3310 if (base64bits) { /* output remaining bits */
3311 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3312 base64buffer = 0;
3313 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003314 }
3315 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003316 /* Characters not in the BASE64 set implicitly unshift the sequence
3317 so no '-' is required, except if the character is itself a '-' */
3318 if (IS_BASE64(ch) || ch == '-') {
3319 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003320 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003321 *out++ = (char) ch;
3322 }
3323 else {
3324 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003325 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003326 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003327 else { /* not in a shift sequence */
3328 if (ch == '+') {
3329 *out++ = '+';
3330 *out++ = '-';
3331 }
3332 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3333 *out++ = (char) ch;
3334 }
3335 else {
3336 *out++ = '+';
3337 inShift = 1;
3338 goto encode_char;
3339 }
3340 }
3341 continue;
3342encode_char:
3343#ifdef Py_UNICODE_WIDE
3344 if (ch >= 0x10000) {
3345 /* code first surrogate */
3346 base64bits += 16;
3347 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3348 while (base64bits >= 6) {
3349 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3350 base64bits -= 6;
3351 }
3352 /* prepare second surrogate */
3353 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3354 }
3355#endif
3356 base64bits += 16;
3357 base64buffer = (base64buffer << 16) | ch;
3358 while (base64bits >= 6) {
3359 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3360 base64bits -= 6;
3361 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003362 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003363 if (base64bits)
3364 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3365 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003366 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003367 if (_PyBytes_Resize(&v, out - start) < 0)
3368 return NULL;
3369 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003370}
3371
/* The UTF-7 helper macros are not needed past this point. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003377
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378/* --- UTF-8 Codec -------------------------------------------------------- */
3379
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-BF: not a legal prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1: illegal; C2-DF: two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4: four-byte sequences; F5-FF: illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
3401
Alexander Belopolsky40018472011-02-26 01:02:56 +00003402PyObject *
3403PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003404 Py_ssize_t size,
3405 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406{
Walter Dörwald69652032004-09-07 20:24:22 +00003407 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3408}
3409
Antoine Pitrouab868312009-01-10 15:40:25 +00003410/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3411#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3412
3413/* Mask to quickly check whether a C 'long' contains a
3414 non-ASCII, UTF8-encoded char. */
3415#if (SIZEOF_LONG == 8)
3416# define ASCII_CHAR_MASK 0x8080808080808080L
3417#elif (SIZEOF_LONG == 4)
3418# define ASCII_CHAR_MASK 0x80808080L
3419#else
3420# error C 'long' size should be either 4 or 8!
3421#endif
3422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003423/* Scans a UTF-8 string and returns the maximum character to be expected,
3424 the size of the decoded unicode string and if any major errors were
3425 encountered.
3426
3427 This function does check basic UTF-8 sanity, it does however NOT CHECK
3428 if the string contains surrogates, and if all continuation bytes are
3429 within the correct ranges, these checks are performed in
3430 PyUnicode_DecodeUTF8Stateful.
3431
3432 If it sets has_errors to 1, it means the value of unicode_size and max_char
3433 will be bogus and you should not rely on useful information in them.
3434 */
3435static Py_UCS4
3436utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3437 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3438 int *has_errors)
3439{
3440 Py_ssize_t n;
3441 Py_ssize_t char_count = 0;
3442 Py_UCS4 max_char = 127, new_max;
3443 Py_UCS4 upper_bound;
3444 const unsigned char *p = (const unsigned char *)s;
3445 const unsigned char *end = p + string_size;
3446 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3447 int err = 0;
3448
3449 for (; p < end && !err; ++p, ++char_count) {
3450 /* Only check value if it's not a ASCII char... */
3451 if (*p < 0x80) {
3452 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3453 an explanation. */
3454 if (!((size_t) p & LONG_PTR_MASK)) {
3455 /* Help register allocation */
3456 register const unsigned char *_p = p;
3457 while (_p < aligned_end) {
3458 unsigned long value = *(unsigned long *) _p;
3459 if (value & ASCII_CHAR_MASK)
3460 break;
3461 _p += SIZEOF_LONG;
3462 char_count += SIZEOF_LONG;
3463 }
3464 p = _p;
3465 if (p == end)
3466 break;
3467 }
3468 }
3469 if (*p >= 0x80) {
3470 n = utf8_code_length[*p];
3471 new_max = max_char;
3472 switch (n) {
3473 /* invalid start byte */
3474 case 0:
3475 err = 1;
3476 break;
3477 case 2:
3478 /* Code points between 0x00FF and 0x07FF inclusive.
3479 Approximate the upper bound of the code point,
3480 if this flips over 255 we can be sure it will be more
3481 than 255 and the string will need 2 bytes per code coint,
3482 if it stays under or equal to 255, we can be sure 1 byte
3483 is enough.
3484 ((*p & 0b00011111) << 6) | 0b00111111 */
3485 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3486 if (max_char < upper_bound)
3487 new_max = upper_bound;
3488 /* Ensure we track at least that we left ASCII space. */
3489 if (new_max < 128)
3490 new_max = 128;
3491 break;
3492 case 3:
3493 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3494 always > 255 and <= 65535 and will always need 2 bytes. */
3495 if (max_char < 65535)
3496 new_max = 65535;
3497 break;
3498 case 4:
3499 /* Code point will be above 0xFFFF for sure in this case. */
3500 new_max = 65537;
3501 break;
3502 /* Internal error, this should be caught by the first if */
3503 case 1:
3504 default:
3505 assert(0 && "Impossible case in utf8_max_char_and_size");
3506 err = 1;
3507 }
3508 /* Instead of number of overall bytes for this code point,
3509 n containts the number of following bytes: */
3510 --n;
3511 /* Check if the follow up chars are all valid continuation bytes */
3512 if (n >= 1) {
3513 const unsigned char *cont;
3514 if ((p + n) >= end) {
3515 if (consumed == 0)
3516 /* incomplete data, non-incremental decoding */
3517 err = 1;
3518 break;
3519 }
3520 for (cont = p + 1; cont < (p + n); ++cont) {
3521 if ((*cont & 0xc0) != 0x80) {
3522 err = 1;
3523 break;
3524 }
3525 }
3526 p += n;
3527 }
3528 else
3529 err = 1;
3530 max_char = new_max;
3531 }
3532 }
3533
3534 if (unicode_size)
3535 *unicode_size = char_count;
3536 if (has_errors)
3537 *has_errors = err;
3538 return max_char;
3539}
3540
/* Store `value` at position `index` of `buf`, converting it to the element
   type selected by `kind`.  Works like PyUnicode_WRITE but additionally
   accepts PyUnicode_WCHAR_KIND, in which case `buf` is treated as an array
   of Py_UNICODE (the wstr field of the legacy representation).  Any kind
   other than WCHAR, 1BYTE or 2BYTE is stored as Py_UCS4.  `kind` is
   evaluated once; `buf`, `index` and `value` are evaluated once, in the
   branch that is taken. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int k_ = (kind);                                              \
        if (k_ == PyUnicode_WCHAR_KIND) {                                   \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        }                                                                   \
        else if (k_ == PyUnicode_1BYTE_KIND) {                              \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        }                                                                   \
        else if (k_ == PyUnicode_2BYTE_KIND) {                              \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        }                                                                   \
        else {                                                              \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
        }                                                                   \
    } while (0)
3555
Alexander Belopolsky40018472011-02-26 01:02:56 +00003556PyObject *
3557PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003558 Py_ssize_t size,
3559 const char *errors,
3560 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003561{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003564 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003565 Py_ssize_t startinpos;
3566 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003567 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003569 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 PyObject *errorHandler = NULL;
3571 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003572 Py_UCS4 maxchar = 0;
3573 Py_ssize_t unicode_size;
3574 Py_ssize_t i;
3575 int kind;
3576 void *data;
3577 int has_errors;
3578 Py_UNICODE *error_outptr;
3579#if SIZEOF_WCHAR_T == 2
3580 Py_ssize_t wchar_offset = 0;
3581#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582
Walter Dörwald69652032004-09-07 20:24:22 +00003583 if (size == 0) {
3584 if (consumed)
3585 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003586 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003588 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3589 consumed, &has_errors);
3590 if (has_errors) {
3591 unicode = _PyUnicode_New(size);
3592 if (!unicode)
3593 return NULL;
3594 kind = PyUnicode_WCHAR_KIND;
3595 data = PyUnicode_AS_UNICODE(unicode);
3596 assert(data != NULL);
3597 }
3598 else {
3599 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3600 if (!unicode)
3601 return NULL;
3602 /* When the string is ASCII only, just use memcpy and return.
3603 unicode_size may be != size if there is an incomplete UTF-8
3604 sequence at the end of the ASCII block. */
3605 if (maxchar < 128 && size == unicode_size) {
3606 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3607 return (PyObject *)unicode;
3608 }
3609 kind = PyUnicode_KIND(unicode);
3610 data = PyUnicode_DATA(unicode);
3611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003613 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003615 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616
3617 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003618 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619
3620 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003621 /* Fast path for runs of ASCII characters. Given that common UTF-8
3622 input will consist of an overwhelming majority of ASCII
3623 characters, we try to optimize for this case by checking
3624 as many characters as a C 'long' can contain.
3625 First, check if we can do an aligned read, as most CPUs have
3626 a penalty for unaligned reads.
3627 */
3628 if (!((size_t) s & LONG_PTR_MASK)) {
3629 /* Help register allocation */
3630 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003631 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003632 while (_s < aligned_end) {
3633 /* Read a whole long at a time (either 4 or 8 bytes),
3634 and do a fast unrolled copy if it only contains ASCII
3635 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003636 unsigned long value = *(unsigned long *) _s;
3637 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003638 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003639 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3640 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3641 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3642 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003643#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003644 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3645 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3646 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3647 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003648#endif
3649 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003650 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003651 }
3652 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003653 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003654 if (s == e)
3655 break;
3656 ch = (unsigned char)*s;
3657 }
3658 }
3659
3660 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003661 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 s++;
3663 continue;
3664 }
3665
3666 n = utf8_code_length[ch];
3667
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003668 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003669 if (consumed)
3670 break;
3671 else {
3672 errmsg = "unexpected end of data";
3673 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003674 endinpos = startinpos+1;
3675 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3676 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 goto utf8Error;
3678 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680
3681 switch (n) {
3682
3683 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003684 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003685 startinpos = s-starts;
3686 endinpos = startinpos+1;
3687 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688
3689 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003690 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003691 startinpos = s-starts;
3692 endinpos = startinpos+1;
3693 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694
3695 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003696 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003697 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003699 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003700 goto utf8Error;
3701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003703 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003704 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 break;
3706
3707 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003708 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3709 will result in surrogates in range d800-dfff. Surrogates are
3710 not valid UTF-8 so they are rejected.
3711 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3712 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003713 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003714 (s[2] & 0xc0) != 0x80 ||
3715 ((unsigned char)s[0] == 0xE0 &&
3716 (unsigned char)s[1] < 0xA0) ||
3717 ((unsigned char)s[0] == 0xED &&
3718 (unsigned char)s[1] > 0x9F)) {
3719 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003720 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003721 endinpos = startinpos + 1;
3722
3723 /* if s[1] first two bits are 1 and 0, then the invalid
3724 continuation byte is s[2], so increment endinpos by 1,
3725 if not, s[1] is invalid and endinpos doesn't need to
3726 be incremented. */
3727 if ((s[1] & 0xC0) == 0x80)
3728 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003729 goto utf8Error;
3730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003732 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003733 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003734 break;
3735
3736 case 4:
3737 if ((s[1] & 0xc0) != 0x80 ||
3738 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003739 (s[3] & 0xc0) != 0x80 ||
3740 ((unsigned char)s[0] == 0xF0 &&
3741 (unsigned char)s[1] < 0x90) ||
3742 ((unsigned char)s[0] == 0xF4 &&
3743 (unsigned char)s[1] > 0x8F)) {
3744 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003745 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003746 endinpos = startinpos + 1;
3747 if ((s[1] & 0xC0) == 0x80) {
3748 endinpos++;
3749 if ((s[2] & 0xC0) == 0x80)
3750 endinpos++;
3751 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003752 goto utf8Error;
3753 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003754 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003755 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3756 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758 /* If the string is flexible or we have native UCS-4, write
3759 directly.. */
3760 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3761 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003763 else {
3764 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766 /* translate from 10000..10FFFF to 0..FFFF */
3767 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 /* high surrogate = top 10 bits added to D800 */
3770 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3771 (Py_UNICODE)(0xD800 + (ch >> 10)));
3772
3773 /* low surrogate = bottom 10 bits added to DC00 */
3774 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3775 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3776 }
3777#if SIZEOF_WCHAR_T == 2
3778 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003779#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 }
3782 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003783 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003784
Benjamin Peterson29060642009-01-31 22:14:21 +00003785 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003786 /* If this is not yet a resizable string, make it one.. */
3787 if (kind != PyUnicode_WCHAR_KIND) {
3788 const Py_UNICODE *u;
3789 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3790 if (!new_unicode)
3791 goto onError;
3792 u = PyUnicode_AsUnicode((PyObject *)unicode);
3793 if (!u)
3794 goto onError;
3795#if SIZEOF_WCHAR_T == 2
3796 i += wchar_offset;
3797#endif
3798 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3799 Py_DECREF(unicode);
3800 unicode = new_unicode;
3801 kind = 0;
3802 data = PyUnicode_AS_UNICODE(new_unicode);
3803 assert(data != NULL);
3804 }
3805 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003806 if (unicode_decode_call_errorhandler(
3807 errors, &errorHandler,
3808 "utf8", errmsg,
3809 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003811 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812 /* Update data because unicode_decode_call_errorhandler might have
3813 re-created or resized the unicode object. */
3814 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003815 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817 /* Ensure the unicode_size calculation above was correct: */
3818 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3819
Walter Dörwald69652032004-09-07 20:24:22 +00003820 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003821 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003823 /* Adjust length and ready string when it contained errors and
3824 is of the old resizable kind. */
3825 if (kind == PyUnicode_WCHAR_KIND) {
3826 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3827 PyUnicode_READY(unicode) == -1)
3828 goto onError;
3829 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003831 Py_XDECREF(errorHandler);
3832 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833 if (PyUnicode_READY(unicode) == -1) {
3834 Py_DECREF(unicode);
3835 return NULL;
3836 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 return (PyObject *)unicode;
3838
Benjamin Peterson29060642009-01-31 22:14:21 +00003839 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003840 Py_XDECREF(errorHandler);
3841 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842 Py_DECREF(unicode);
3843 return NULL;
3844}
3845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003846#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003847
#ifdef __APPLE__

/* Simplified UTF-8 decoder using the surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a buffer obtained from
   PyMem_Malloc() and terminated with L'\0'.  A byte that does not start a
   well-formed UTF-8 sequence is stored as 0xDC00 + byte and decoding
   resumes at the following byte.  Returns NULL if the buffer cannot be
   allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;

    /* Every input byte yields at most one output unit, so size + 1 units
       (including the terminator) are always sufficient. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    out = buffer;
    end = s + size;
    while (s < end) {
        const Py_UCS4 lead = (unsigned char)*s;
        Py_UCS4 ch;
        int seqlen;
        int wellformed = 1;

        /* ASCII is copied through unchanged. */
        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        seqlen = utf8_code_length[lead];
        if (s + seqlen > end) {
            /* the sequence would run past the end of the input */
            wellformed = 0;
        }
        else {
            switch (seqlen) {
            case 0:
            case 1:
                /* not a valid lead byte */
                wellformed = 0;
                break;

            case 2:
                if ((s[1] & 0xc0) != 0x80) {
                    wellformed = 0;
                    break;
                }
                ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
                assert ((ch > 0x007F) && (ch <= 0x07FF));
                *out++ = (wchar_t)ch;
                break;

            case 3:
                /* Besides the continuation bytes, reject overlong forms
                   (E0 followed by a byte below A0) and the range
                   \xed\xa0\x80-\xed\xbf\xbf, which would decode to the
                   surrogates d800-dfff.  Surrogates are not valid UTF-8.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                if ((s[1] & 0xc0) != 0x80 ||
                    (s[2] & 0xc0) != 0x80 ||
                    ((unsigned char)s[0] == 0xE0 &&
                     (unsigned char)s[1] < 0xA0) ||
                    ((unsigned char)s[0] == 0xED &&
                     (unsigned char)s[1] > 0x9F)) {
                    wellformed = 0;
                    break;
                }
                ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                assert ((ch > 0x07FF) && (ch <= 0xFFFF));
                *out++ = (wchar_t)ch;
                break;

            case 4:
                /* Reject overlong forms (F0 followed by a byte below 90)
                   and values above 0x10FFFF (F4 followed by a byte above
                   8F). */
                if ((s[1] & 0xc0) != 0x80 ||
                    (s[2] & 0xc0) != 0x80 ||
                    (s[3] & 0xc0) != 0x80 ||
                    ((unsigned char)s[0] == 0xF0 &&
                     (unsigned char)s[1] < 0x90) ||
                    ((unsigned char)s[0] == 0xF4 &&
                     (unsigned char)s[1] > 0x8F)) {
                    wellformed = 0;
                    break;
                }
                ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                     ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
                assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
                *out++ = (wchar_t)ch;
#else
                /* 16-bit wchar_t: emit a surrogate pair. */

                /* translate from 10000..10FFFF to 0..FFFF */
                ch -= 0x10000;

                /* high surrogate = top 10 bits added to D800 */
                *out++ = (wchar_t)(0xD800 + (ch >> 10));

                /* low surrogate = bottom 10 bits added to DC00 */
                *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
                break;
            }
        }

        if (wellformed) {
            s += seqlen;
        }
        else {
            /* surrogateescape: map the offending byte to 0xDC00 + byte */
            *out++ = 0xDC00 + lead;
            s++;
        }
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963/* Primary internal function which creates utf8 encoded bytes objects.
3964
3965 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003966 and allocate exactly as much space needed at the end. Else allocate the
3967 maximum possible needed (4 result bytes per Unicode character), and return
3968 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003969*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003970PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972{
Tim Peters602f7402002-04-27 18:03:26 +00003973#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003974
Guido van Rossum98297ee2007-11-06 21:34:58 +00003975 Py_ssize_t i; /* index into s of next input byte */
3976 PyObject *result; /* result string object */
3977 char *p; /* next free byte in output buffer */
3978 Py_ssize_t nallocated; /* number of result bytes allocated */
3979 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003980 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003981 PyObject *errorHandler = NULL;
3982 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983 int kind;
3984 void *data;
3985 Py_ssize_t size;
3986 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3987#if SIZEOF_WCHAR_T == 2
3988 Py_ssize_t wchar_offset = 0;
3989#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003991 if (!PyUnicode_Check(unicode)) {
3992 PyErr_BadArgument();
3993 return NULL;
3994 }
3995
3996 if (PyUnicode_READY(unicode) == -1)
3997 return NULL;
3998
3999 if (_PyUnicode_UTF8(unicode))
4000 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
4001 _PyUnicode_UTF8_LENGTH(unicode));
4002
4003 kind = PyUnicode_KIND(unicode);
4004 data = PyUnicode_DATA(unicode);
4005 size = PyUnicode_GET_LENGTH(unicode);
4006
Tim Peters602f7402002-04-27 18:03:26 +00004007 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008
Tim Peters602f7402002-04-27 18:03:26 +00004009 if (size <= MAX_SHORT_UNICHARS) {
4010 /* Write into the stack buffer; nallocated can't overflow.
4011 * At the end, we'll allocate exactly as much heap space as it
4012 * turns out we need.
4013 */
4014 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004015 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004016 p = stackbuf;
4017 }
4018 else {
4019 /* Overallocate on the heap, and give the excess back at the end. */
4020 nallocated = size * 4;
4021 if (nallocated / 4 != size) /* overflow! */
4022 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004023 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004024 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004025 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004026 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004027 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004028
Tim Peters602f7402002-04-27 18:03:26 +00004029 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004031
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004032 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004033 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004037 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004038 *p++ = (char)(0xc0 | (ch >> 6));
4039 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004040 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041 Py_ssize_t newpos;
4042 PyObject *rep;
4043 Py_ssize_t repsize, k, startpos;
4044 startpos = i-1;
4045#if SIZEOF_WCHAR_T == 2
4046 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004047#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 rep = unicode_encode_call_errorhandler(
4049 errors, &errorHandler, "utf-8", "surrogates not allowed",
4050 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4051 &exc, startpos, startpos+1, &newpos);
4052 if (!rep)
4053 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055 if (PyBytes_Check(rep))
4056 repsize = PyBytes_GET_SIZE(rep);
4057 else
4058 repsize = PyUnicode_GET_SIZE(rep);
4059
4060 if (repsize > 4) {
4061 Py_ssize_t offset;
4062
4063 if (result == NULL)
4064 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004065 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4069 /* integer overflow */
4070 PyErr_NoMemory();
4071 goto error;
4072 }
4073 nallocated += repsize - 4;
4074 if (result != NULL) {
4075 if (_PyBytes_Resize(&result, nallocated) < 0)
4076 goto error;
4077 } else {
4078 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004079 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080 goto error;
4081 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4082 }
4083 p = PyBytes_AS_STRING(result) + offset;
4084 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004085
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004086 if (PyBytes_Check(rep)) {
4087 char *prep = PyBytes_AS_STRING(rep);
4088 for(k = repsize; k > 0; k--)
4089 *p++ = *prep++;
4090 } else /* rep is unicode */ {
4091 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4092 Py_UNICODE c;
4093
4094 for(k=0; k<repsize; k++) {
4095 c = prep[k];
4096 if (0x80 <= c) {
4097 raise_encode_exception(&exc, "utf-8",
4098 PyUnicode_AS_UNICODE(unicode),
4099 size, i-1, i,
4100 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004101 goto error;
4102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004103 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004104 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004105 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004106 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004107 } else if (ch < 0x10000) {
4108 *p++ = (char)(0xe0 | (ch >> 12));
4109 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4110 *p++ = (char)(0x80 | (ch & 0x3f));
4111 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004112 /* Encode UCS4 Unicode ordinals */
4113 *p++ = (char)(0xf0 | (ch >> 18));
4114 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4115 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4116 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004117#if SIZEOF_WCHAR_T == 2
4118 wchar_offset++;
4119#endif
Tim Peters602f7402002-04-27 18:03:26 +00004120 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004122
Guido van Rossum98297ee2007-11-06 21:34:58 +00004123 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004124 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004125 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004126 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004127 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004128 }
4129 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004130 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004131 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004132 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004133 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004134 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004135
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004136 Py_XDECREF(errorHandler);
4137 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004138 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004139 error:
4140 Py_XDECREF(errorHandler);
4141 Py_XDECREF(exc);
4142 Py_XDECREF(result);
4143 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004144
Tim Peters602f7402002-04-27 18:03:26 +00004145#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146}
4147
Alexander Belopolsky40018472011-02-26 01:02:56 +00004148PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004149PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4150 Py_ssize_t size,
4151 const char *errors)
4152{
4153 PyObject *v, *unicode;
4154
4155 unicode = PyUnicode_FromUnicode(s, size);
4156 if (unicode == NULL)
4157 return NULL;
4158 v = _PyUnicode_AsUTF8String(unicode, errors);
4159 Py_DECREF(unicode);
4160 return v;
4161}
4162
4163PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004164PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167}
4168
Walter Dörwald41980ca2007-08-16 21:55:45 +00004169/* --- UTF-32 Codec ------------------------------------------------------- */
4170
4171PyObject *
4172PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 Py_ssize_t size,
4174 const char *errors,
4175 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004176{
4177 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4178}
4179
4180PyObject *
4181PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 Py_ssize_t size,
4183 const char *errors,
4184 int *byteorder,
4185 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004186{
4187 const char *starts = s;
4188 Py_ssize_t startinpos;
4189 Py_ssize_t endinpos;
4190 Py_ssize_t outpos;
4191 PyUnicodeObject *unicode;
4192 Py_UNICODE *p;
4193#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004194 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004195 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004196#else
4197 const int pairs = 0;
4198#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004199 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004200 int bo = 0; /* assume native ordering by default */
4201 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004202 /* Offsets from q for retrieving bytes in the right order. */
4203#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4204 int iorder[] = {0, 1, 2, 3};
4205#else
4206 int iorder[] = {3, 2, 1, 0};
4207#endif
4208 PyObject *errorHandler = NULL;
4209 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004210
Walter Dörwald41980ca2007-08-16 21:55:45 +00004211 q = (unsigned char *)s;
4212 e = q + size;
4213
4214 if (byteorder)
4215 bo = *byteorder;
4216
4217 /* Check for BOM marks (U+FEFF) in the input and adjust current
4218 byte order setting accordingly. In native mode, the leading BOM
4219 mark is skipped, in all other modes, it is copied to the output
4220 stream as-is (giving a ZWNBSP character). */
4221 if (bo == 0) {
4222 if (size >= 4) {
4223 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004224 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004225#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 if (bom == 0x0000FEFF) {
4227 q += 4;
4228 bo = -1;
4229 }
4230 else if (bom == 0xFFFE0000) {
4231 q += 4;
4232 bo = 1;
4233 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004234#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004235 if (bom == 0x0000FEFF) {
4236 q += 4;
4237 bo = 1;
4238 }
4239 else if (bom == 0xFFFE0000) {
4240 q += 4;
4241 bo = -1;
4242 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004243#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004245 }
4246
4247 if (bo == -1) {
4248 /* force LE */
4249 iorder[0] = 0;
4250 iorder[1] = 1;
4251 iorder[2] = 2;
4252 iorder[3] = 3;
4253 }
4254 else if (bo == 1) {
4255 /* force BE */
4256 iorder[0] = 3;
4257 iorder[1] = 2;
4258 iorder[2] = 1;
4259 iorder[3] = 0;
4260 }
4261
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004262 /* On narrow builds we split characters outside the BMP into two
4263 codepoints => count how much extra space we need. */
4264#ifndef Py_UNICODE_WIDE
4265 for (qq = q; qq < e; qq += 4)
4266 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4267 pairs++;
4268#endif
4269
4270 /* This might be one to much, because of a BOM */
4271 unicode = _PyUnicode_New((size+3)/4+pairs);
4272 if (!unicode)
4273 return NULL;
4274 if (size == 0)
4275 return (PyObject *)unicode;
4276
4277 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004278 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004279
Walter Dörwald41980ca2007-08-16 21:55:45 +00004280 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004281 Py_UCS4 ch;
4282 /* remaining bytes at the end? (size should be divisible by 4) */
4283 if (e-q<4) {
4284 if (consumed)
4285 break;
4286 errmsg = "truncated data";
4287 startinpos = ((const char *)q)-starts;
4288 endinpos = ((const char *)e)-starts;
4289 goto utf32Error;
4290 /* The remaining input chars are ignored if the callback
4291 chooses to skip the input */
4292 }
4293 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4294 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004295
Benjamin Peterson29060642009-01-31 22:14:21 +00004296 if (ch >= 0x110000)
4297 {
4298 errmsg = "codepoint not in range(0x110000)";
4299 startinpos = ((const char *)q)-starts;
4300 endinpos = startinpos+4;
4301 goto utf32Error;
4302 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004303#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004304 if (ch >= 0x10000)
4305 {
4306 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4307 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4308 }
4309 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004310#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004311 *p++ = ch;
4312 q += 4;
4313 continue;
4314 utf32Error:
4315 outpos = p-PyUnicode_AS_UNICODE(unicode);
4316 if (unicode_decode_call_errorhandler(
4317 errors, &errorHandler,
4318 "utf32", errmsg,
4319 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4320 &unicode, &outpos, &p))
4321 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004322 }
4323
4324 if (byteorder)
4325 *byteorder = bo;
4326
4327 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004328 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004329
4330 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004331 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004332 goto onError;
4333
4334 Py_XDECREF(errorHandler);
4335 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004336 if (PyUnicode_READY(unicode) == -1) {
4337 Py_DECREF(unicode);
4338 return NULL;
4339 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004340 return (PyObject *)unicode;
4341
Benjamin Peterson29060642009-01-31 22:14:21 +00004342 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004343 Py_DECREF(unicode);
4344 Py_XDECREF(errorHandler);
4345 Py_XDECREF(exc);
4346 return NULL;
4347}
4348
4349PyObject *
4350PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004351 Py_ssize_t size,
4352 const char *errors,
4353 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004354{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004355 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004356 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004357 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004358#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004359 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004360#else
4361 const int pairs = 0;
4362#endif
4363 /* Offsets from p for storing byte pairs in the right order. */
4364#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4365 int iorder[] = {0, 1, 2, 3};
4366#else
4367 int iorder[] = {3, 2, 1, 0};
4368#endif
4369
Benjamin Peterson29060642009-01-31 22:14:21 +00004370#define STORECHAR(CH) \
4371 do { \
4372 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4373 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4374 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4375 p[iorder[0]] = (CH) & 0xff; \
4376 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004377 } while(0)
4378
4379 /* In narrow builds we can output surrogate pairs as one codepoint,
4380 so we need less space. */
4381#ifndef Py_UNICODE_WIDE
4382 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004383 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4384 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4385 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004386#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004387 nsize = (size - pairs + (byteorder == 0));
4388 bytesize = nsize * 4;
4389 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004391 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004392 if (v == NULL)
4393 return NULL;
4394
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004395 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004396 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004398 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004399 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004400
4401 if (byteorder == -1) {
4402 /* force LE */
4403 iorder[0] = 0;
4404 iorder[1] = 1;
4405 iorder[2] = 2;
4406 iorder[3] = 3;
4407 }
4408 else if (byteorder == 1) {
4409 /* force BE */
4410 iorder[0] = 3;
4411 iorder[1] = 2;
4412 iorder[2] = 1;
4413 iorder[3] = 0;
4414 }
4415
4416 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004418#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004419 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4420 Py_UCS4 ch2 = *s;
4421 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4422 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4423 s++;
4424 size--;
4425 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004426 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004427#endif
4428 STORECHAR(ch);
4429 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004430
4431 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004432 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004433#undef STORECHAR
4434}
4435
Alexander Belopolsky40018472011-02-26 01:02:56 +00004436PyObject *
4437PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004438{
4439 if (!PyUnicode_Check(unicode)) {
4440 PyErr_BadArgument();
4441 return NULL;
4442 }
4443 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004444 PyUnicode_GET_SIZE(unicode),
4445 NULL,
4446 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004447}
4448
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449/* --- UTF-16 Codec ------------------------------------------------------- */
4450
Tim Peters772747b2001-08-09 22:21:55 +00004451PyObject *
4452PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004453 Py_ssize_t size,
4454 const char *errors,
4455 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456{
Walter Dörwald69652032004-09-07 20:24:22 +00004457 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4458}
4459
Antoine Pitrouab868312009-01-10 15:40:25 +00004460/* Two masks for fast checking of whether a C 'long' may contain
4461 UTF16-encoded surrogate characters. This is an efficient heuristic,
4462 assuming that non-surrogate characters with a code point >= 0x8000 are
4463 rare in most input.
4464 FAST_CHAR_MASK is used when the input is in native byte ordering,
4465 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004466*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004467#if (SIZEOF_LONG == 8)
4468# define FAST_CHAR_MASK 0x8000800080008000L
4469# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4470#elif (SIZEOF_LONG == 4)
4471# define FAST_CHAR_MASK 0x80008000L
4472# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4473#else
4474# error C 'long' size should be either 4 or 8!
4475#endif
4476
Walter Dörwald69652032004-09-07 20:24:22 +00004477PyObject *
4478PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 Py_ssize_t size,
4480 const char *errors,
4481 int *byteorder,
4482 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004483{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004484 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004485 Py_ssize_t startinpos;
4486 Py_ssize_t endinpos;
4487 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 PyUnicodeObject *unicode;
4489 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004490 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004491 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004492 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004493 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004494 /* Offsets from q for retrieving byte pairs in the right order. */
4495#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4496 int ihi = 1, ilo = 0;
4497#else
4498 int ihi = 0, ilo = 1;
4499#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500 PyObject *errorHandler = NULL;
4501 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502
4503 /* Note: size will always be longer than the resulting Unicode
4504 character count */
4505 unicode = _PyUnicode_New(size);
4506 if (!unicode)
4507 return NULL;
4508 if (size == 0)
4509 return (PyObject *)unicode;
4510
4511 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004512 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004513 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004514 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515
4516 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004517 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004519 /* Check for BOM marks (U+FEFF) in the input and adjust current
4520 byte order setting accordingly. In native mode, the leading BOM
4521 mark is skipped, in all other modes, it is copied to the output
4522 stream as-is (giving a ZWNBSP character). */
4523 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004524 if (size >= 2) {
4525 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004526#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 if (bom == 0xFEFF) {
4528 q += 2;
4529 bo = -1;
4530 }
4531 else if (bom == 0xFFFE) {
4532 q += 2;
4533 bo = 1;
4534 }
Tim Petersced69f82003-09-16 20:30:58 +00004535#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004536 if (bom == 0xFEFF) {
4537 q += 2;
4538 bo = 1;
4539 }
4540 else if (bom == 0xFFFE) {
4541 q += 2;
4542 bo = -1;
4543 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004544#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547
Tim Peters772747b2001-08-09 22:21:55 +00004548 if (bo == -1) {
4549 /* force LE */
4550 ihi = 1;
4551 ilo = 0;
4552 }
4553 else if (bo == 1) {
4554 /* force BE */
4555 ihi = 0;
4556 ilo = 1;
4557 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004558#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4559 native_ordering = ilo < ihi;
4560#else
4561 native_ordering = ilo > ihi;
4562#endif
Tim Peters772747b2001-08-09 22:21:55 +00004563
Antoine Pitrouab868312009-01-10 15:40:25 +00004564 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004565 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004566 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004567 /* First check for possible aligned read of a C 'long'. Unaligned
4568 reads are more expensive, better to defer to another iteration. */
4569 if (!((size_t) q & LONG_PTR_MASK)) {
4570 /* Fast path for runs of non-surrogate chars. */
4571 register const unsigned char *_q = q;
4572 Py_UNICODE *_p = p;
4573 if (native_ordering) {
4574 /* Native ordering is simple: as long as the input cannot
4575 possibly contain a surrogate char, do an unrolled copy
4576 of several 16-bit code points to the target object.
4577 The non-surrogate check is done on several input bytes
4578 at a time (as many as a C 'long' can contain). */
4579 while (_q < aligned_end) {
4580 unsigned long data = * (unsigned long *) _q;
4581 if (data & FAST_CHAR_MASK)
4582 break;
4583 _p[0] = ((unsigned short *) _q)[0];
4584 _p[1] = ((unsigned short *) _q)[1];
4585#if (SIZEOF_LONG == 8)
4586 _p[2] = ((unsigned short *) _q)[2];
4587 _p[3] = ((unsigned short *) _q)[3];
4588#endif
4589 _q += SIZEOF_LONG;
4590 _p += SIZEOF_LONG / 2;
4591 }
4592 }
4593 else {
4594 /* Byteswapped ordering is similar, but we must decompose
4595 the copy bytewise, and take care of zero'ing out the
4596 upper bytes if the target object is in 32-bit units
4597 (that is, in UCS-4 builds). */
4598 while (_q < aligned_end) {
4599 unsigned long data = * (unsigned long *) _q;
4600 if (data & SWAPPED_FAST_CHAR_MASK)
4601 break;
4602 /* Zero upper bytes in UCS-4 builds */
4603#if (Py_UNICODE_SIZE > 2)
4604 _p[0] = 0;
4605 _p[1] = 0;
4606#if (SIZEOF_LONG == 8)
4607 _p[2] = 0;
4608 _p[3] = 0;
4609#endif
4610#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004611 /* Issue #4916; UCS-4 builds on big endian machines must
4612 fill the two last bytes of each 4-byte unit. */
4613#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4614# define OFF 2
4615#else
4616# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004617#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004618 ((unsigned char *) _p)[OFF + 1] = _q[0];
4619 ((unsigned char *) _p)[OFF + 0] = _q[1];
4620 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4621 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4622#if (SIZEOF_LONG == 8)
4623 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4624 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4625 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4626 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4627#endif
4628#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004629 _q += SIZEOF_LONG;
4630 _p += SIZEOF_LONG / 2;
4631 }
4632 }
4633 p = _p;
4634 q = _q;
4635 if (q >= e)
4636 break;
4637 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004638 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639
Benjamin Peterson14339b62009-01-31 16:36:08 +00004640 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004641
4642 if (ch < 0xD800 || ch > 0xDFFF) {
4643 *p++ = ch;
4644 continue;
4645 }
4646
4647 /* UTF-16 code pair: */
4648 if (q > e) {
4649 errmsg = "unexpected end of data";
4650 startinpos = (((const char *)q) - 2) - starts;
4651 endinpos = ((const char *)e) + 1 - starts;
4652 goto utf16Error;
4653 }
4654 if (0xD800 <= ch && ch <= 0xDBFF) {
4655 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4656 q += 2;
4657 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004658#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004659 *p++ = ch;
4660 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004661#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004662 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004663#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004664 continue;
4665 }
4666 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004667 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004668 startinpos = (((const char *)q)-4)-starts;
4669 endinpos = startinpos+2;
4670 goto utf16Error;
4671 }
4672
Benjamin Peterson14339b62009-01-31 16:36:08 +00004673 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 errmsg = "illegal encoding";
4675 startinpos = (((const char *)q)-2)-starts;
4676 endinpos = startinpos+2;
4677 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004678
Benjamin Peterson29060642009-01-31 22:14:21 +00004679 utf16Error:
4680 outpos = p - PyUnicode_AS_UNICODE(unicode);
4681 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004682 errors,
4683 &errorHandler,
4684 "utf16", errmsg,
4685 &starts,
4686 (const char **)&e,
4687 &startinpos,
4688 &endinpos,
4689 &exc,
4690 (const char **)&q,
4691 &unicode,
4692 &outpos,
4693 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004696 /* remaining byte at the end? (size should be even) */
4697 if (e == q) {
4698 if (!consumed) {
4699 errmsg = "truncated data";
4700 startinpos = ((const char *)q) - starts;
4701 endinpos = ((const char *)e) + 1 - starts;
4702 outpos = p - PyUnicode_AS_UNICODE(unicode);
4703 if (unicode_decode_call_errorhandler(
4704 errors,
4705 &errorHandler,
4706 "utf16", errmsg,
4707 &starts,
4708 (const char **)&e,
4709 &startinpos,
4710 &endinpos,
4711 &exc,
4712 (const char **)&q,
4713 &unicode,
4714 &outpos,
4715 &p))
4716 goto onError;
4717 /* The remaining input chars are ignored if the callback
4718 chooses to skip the input */
4719 }
4720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721
4722 if (byteorder)
4723 *byteorder = bo;
4724
Walter Dörwald69652032004-09-07 20:24:22 +00004725 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004726 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004727
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004729 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 goto onError;
4731
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 Py_XDECREF(errorHandler);
4733 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004734 if (PyUnicode_READY(unicode) == -1) {
4735 Py_DECREF(unicode);
4736 return NULL;
4737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738 return (PyObject *)unicode;
4739
Benjamin Peterson29060642009-01-31 22:14:21 +00004740 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 Py_XDECREF(errorHandler);
4743 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 return NULL;
4745}
4746
Antoine Pitrouab868312009-01-10 15:40:25 +00004747#undef FAST_CHAR_MASK
4748#undef SWAPPED_FAST_CHAR_MASK
4749
Tim Peters772747b2001-08-09 22:21:55 +00004750PyObject *
4751PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 Py_ssize_t size,
4753 const char *errors,
4754 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004756 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004757 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004758 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004759#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004760 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004761#else
4762 const int pairs = 0;
4763#endif
Tim Peters772747b2001-08-09 22:21:55 +00004764 /* Offsets from p for storing byte pairs in the right order. */
4765#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4766 int ihi = 1, ilo = 0;
4767#else
4768 int ihi = 0, ilo = 1;
4769#endif
4770
Benjamin Peterson29060642009-01-31 22:14:21 +00004771#define STORECHAR(CH) \
4772 do { \
4773 p[ihi] = ((CH) >> 8) & 0xff; \
4774 p[ilo] = (CH) & 0xff; \
4775 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004776 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004778#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004779 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 if (s[i] >= 0x10000)
4781 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004782#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004783 /* 2 * (size + pairs + (byteorder == 0)) */
4784 if (size > PY_SSIZE_T_MAX ||
4785 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004786 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004787 nsize = size + pairs + (byteorder == 0);
4788 bytesize = nsize * 2;
4789 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004790 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004791 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 if (v == NULL)
4793 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004795 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004797 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004798 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004799 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004800
4801 if (byteorder == -1) {
4802 /* force LE */
4803 ihi = 1;
4804 ilo = 0;
4805 }
4806 else if (byteorder == 1) {
4807 /* force BE */
4808 ihi = 0;
4809 ilo = 1;
4810 }
4811
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004812 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 Py_UNICODE ch = *s++;
4814 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004815#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004816 if (ch >= 0x10000) {
4817 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4818 ch = 0xD800 | ((ch-0x10000) >> 10);
4819 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004820#endif
Tim Peters772747b2001-08-09 22:21:55 +00004821 STORECHAR(ch);
4822 if (ch2)
4823 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004824 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004825
4826 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004827 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004828#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829}
4830
Alexander Belopolsky40018472011-02-26 01:02:56 +00004831PyObject *
4832PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833{
4834 if (!PyUnicode_Check(unicode)) {
4835 PyErr_BadArgument();
4836 return NULL;
4837 }
4838 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 PyUnicode_GET_SIZE(unicode),
4840 NULL,
4841 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842}
4843
4844/* --- Unicode Escape Codec ----------------------------------------------- */
4845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004846/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4847 if all the escapes in the string make it still a valid ASCII string.
4848 Returns -1 if any escapes were found which cause the string to
4849 pop out of ASCII range. Otherwise returns the length of the
4850 required buffer to hold the string.
4851 */
4852Py_ssize_t
4853length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4854{
4855 const unsigned char *p = (const unsigned char *)s;
4856 const unsigned char *end = p + size;
4857 Py_ssize_t length = 0;
4858
4859 if (size < 0)
4860 return -1;
4861
4862 for (; p < end; ++p) {
4863 if (*p > 127) {
4864 /* Non-ASCII */
4865 return -1;
4866 }
4867 else if (*p != '\\') {
4868 /* Normal character */
4869 ++length;
4870 }
4871 else {
4872 /* Backslash-escape, check next char */
4873 ++p;
4874 /* Escape sequence reaches till end of string or
4875 non-ASCII follow-up. */
4876 if (p >= end || *p > 127)
4877 return -1;
4878 switch (*p) {
4879 case '\n':
4880 /* backslash + \n result in zero characters */
4881 break;
4882 case '\\': case '\'': case '\"':
4883 case 'b': case 'f': case 't':
4884 case 'n': case 'r': case 'v': case 'a':
4885 ++length;
4886 break;
4887 case '0': case '1': case '2': case '3':
4888 case '4': case '5': case '6': case '7':
4889 case 'x': case 'u': case 'U': case 'N':
4890 /* these do not guarantee ASCII characters */
4891 return -1;
4892 default:
4893 /* count the backslash + the other character */
4894 length += 2;
4895 }
4896 }
4897 }
4898 return length;
4899}
4900
/* Similar to PyUnicode_WRITE, but store either into a wstr (Py_UNICODE)
   buffer or into a one-byte-per-character buffer.

   WRITE_ASCII_OR_WSTR(kind, buf, index, value)
       Statement macro.  If `kind` is PyUnicode_WCHAR_KIND, `buf` is
       treated as a Py_UNICODE array; otherwise it is treated as an
       unsigned char array and `value` is truncated to one byte.
       `index` and `value` are each evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* WRITE_WSTR(buf, index, value)
       Expression macro that always stores into a Py_UNICODE array.
       NOTE: it asserts on a variable named `kind` that must be in scope
       at the point of use; it is not a macro parameter.
       The whole expansion is parenthesized so the comma expression stays
       a single operand wherever the macro is used. */
#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
4914
4915
Fredrik Lundh06d12682001-01-24 07:59:11 +00004916static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004917
Alexander Belopolsky40018472011-02-26 01:02:56 +00004918PyObject *
4919PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004920 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004921 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004924 Py_ssize_t startinpos;
4925 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004926 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004928 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004930 char* message;
4931 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004932 PyObject *errorHandler = NULL;
4933 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004934 Py_ssize_t ascii_length;
4935 Py_ssize_t i;
4936 int kind;
4937 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004939 ascii_length = length_of_escaped_ascii_string(s, size);
4940
4941 /* After length_of_escaped_ascii_string() there are two alternatives,
4942 either the string is pure ASCII with named escapes like \n, etc.
4943 and we determined it's exact size (common case)
4944 or it contains \x, \u, ... escape sequences. then we create a
4945 legacy wchar string and resize it at the end of this function. */
4946 if (ascii_length >= 0) {
4947 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4948 if (!v)
4949 goto onError;
4950 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4951 kind = PyUnicode_1BYTE_KIND;
4952 data = PyUnicode_DATA(v);
4953 }
4954 else {
4955 /* Escaped strings will always be longer than the resulting
4956 Unicode string, so we start with size here and then reduce the
4957 length after conversion to the true value.
4958 (but if the error callback returns a long replacement string
4959 we'll have to allocate more space) */
4960 v = _PyUnicode_New(size);
4961 if (!v)
4962 goto onError;
4963 kind = PyUnicode_WCHAR_KIND;
4964 data = PyUnicode_AS_UNICODE(v);
4965 }
4966
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967 if (size == 0)
4968 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004969 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004971
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972 while (s < end) {
4973 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004974 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004975 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004977 if (kind == PyUnicode_WCHAR_KIND) {
4978 assert(i < _PyUnicode_WSTR_LENGTH(v));
4979 }
4980 else {
4981 /* The only case in which i == ascii_length is a backslash
4982 followed by a newline. */
4983 assert(i <= ascii_length);
4984 }
4985
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 /* Non-escape characters are interpreted as Unicode ordinals */
4987 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004988 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 continue;
4990 }
4991
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004992 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 /* \ - Escapes */
4994 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004995 c = *s++;
4996 if (s > end)
4997 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004998
4999 if (kind == PyUnicode_WCHAR_KIND) {
5000 assert(i < _PyUnicode_WSTR_LENGTH(v));
5001 }
5002 else {
5003 /* The only case in which i == ascii_length is a backslash
5004 followed by a newline. */
5005 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5006 }
5007
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005008 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005011 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005012 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5013 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5014 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5015 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5016 /* FF */
5017 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5018 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5019 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5020 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5021 /* VT */
5022 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5023 /* BEL, not classic C */
5024 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027 case '0': case '1': case '2': case '3':
5028 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005029 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005030 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005031 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005032 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005033 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005035 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 break;
5037
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 /* hex escapes */
5039 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005041 digits = 2;
5042 message = "truncated \\xXX escape";
5043 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005047 digits = 4;
5048 message = "truncated \\uXXXX escape";
5049 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005052 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005053 digits = 8;
5054 message = "truncated \\UXXXXXXXX escape";
5055 hexescape:
5056 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005057 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005058 if (s+digits>end) {
5059 endinpos = size;
5060 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 errors, &errorHandler,
5062 "unicodeescape", "end of string in escape sequence",
5063 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005064 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005065 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005066 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005067 goto nextByte;
5068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005069 for (j = 0; j < digits; ++j) {
5070 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005071 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005072 endinpos = (s+j+1)-starts;
5073 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005074 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 errors, &errorHandler,
5076 "unicodeescape", message,
5077 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005078 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005079 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005080 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005081 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005082 }
5083 chr = (chr<<4) & ~0xF;
5084 if (c >= '0' && c <= '9')
5085 chr += c - '0';
5086 else if (c >= 'a' && c <= 'f')
5087 chr += 10 + c - 'a';
5088 else
5089 chr += 10 + c - 'A';
5090 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005091 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005092 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005093 /* _decoding_error will have already written into the
5094 target buffer. */
5095 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005096 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005097 /* when we get here, chr is a 32-bit unicode character */
5098 if (chr <= 0xffff)
5099 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005100 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005101 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005102 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005103 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005104#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005105 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005106#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005107 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005108 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5109 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005110#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005111 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005113 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005115 errors, &errorHandler,
5116 "unicodeescape", "illegal Unicode character",
5117 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005118 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005119 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005120 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005121 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005122 break;
5123
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005125 case 'N':
5126 message = "malformed \\N character escape";
5127 if (ucnhash_CAPI == NULL) {
5128 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005129 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5130 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005131 if (ucnhash_CAPI == NULL)
5132 goto ucnhashError;
5133 }
5134 if (*s == '{') {
5135 const char *start = s+1;
5136 /* look for the closing brace */
5137 while (*s != '}' && s < end)
5138 s++;
5139 if (s > start && s < end && *s == '}') {
5140 /* found a name. look it up in the unicode database */
5141 message = "unknown Unicode character name";
5142 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005143 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5144 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005145 goto store;
5146 }
5147 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005148 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005149 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005150 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005151 errors, &errorHandler,
5152 "unicodeescape", message,
5153 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005154 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005155 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005156 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005157 break;
5158
5159 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005160 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005161 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005162 message = "\\ at end of string";
5163 s--;
5164 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005165 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005166 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 errors, &errorHandler,
5168 "unicodeescape", message,
5169 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005170 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005171 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005172 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005173 }
5174 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005175 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5176 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005177 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005178 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005181 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005183 /* Ensure the length prediction worked in case of ASCII strings */
5184 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5185
5186 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5187 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005188 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005189 Py_XDECREF(errorHandler);
5190 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005192
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005194 PyErr_SetString(
5195 PyExc_UnicodeError,
5196 "\\N escapes not supported (can't load unicodedata module)"
5197 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005198 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005199 Py_XDECREF(errorHandler);
5200 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005201 return NULL;
5202
Benjamin Peterson29060642009-01-31 22:14:21 +00005203 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205 Py_XDECREF(errorHandler);
5206 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 return NULL;
5208}
5209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005210#undef WRITE_ASCII_OR_WSTR
5211#undef WRITE_WSTR
5212
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213/* Return a Unicode-Escape string version of the Unicode object.
5214
5215 If quotes is true, the string is enclosed in u"" or u'' quotes as
5216 appropriate.
5217
5218*/
5219
Walter Dörwald79e913e2007-05-12 11:08:06 +00005220static const char *hexdigits = "0123456789abcdef";
5221
Alexander Belopolsky40018472011-02-26 01:02:56 +00005222PyObject *
5223PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005224 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005226 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005229#ifdef Py_UNICODE_WIDE
5230 const Py_ssize_t expandsize = 10;
5231#else
5232 const Py_ssize_t expandsize = 6;
5233#endif
5234
Thomas Wouters89f507f2006-12-13 04:49:30 +00005235 /* XXX(nnorwitz): rather than over-allocating, it would be
5236 better to choose a different scheme. Perhaps scan the
5237 first N-chars of the string and allocate based on that size.
5238 */
5239 /* Initial allocation is based on the longest-possible unichr
5240 escape.
5241
5242 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5243 unichr, so in this case it's the longest unichr escape. In
5244 narrow (UTF-16) builds this is five chars per source unichr
5245 since there are two unichrs in the surrogate pair, so in narrow
5246 (UTF-16) builds it's not the longest unichr escape.
5247
5248 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5249 so in the narrow (UTF-16) build case it's the longest unichr
5250 escape.
5251 */
5252
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005253 if (size == 0)
5254 return PyBytes_FromStringAndSize(NULL, 0);
5255
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005256 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005258
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005259 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 2
5261 + expandsize*size
5262 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 if (repr == NULL)
5264 return NULL;
5265
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005266 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 while (size-- > 0) {
5269 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005270
Walter Dörwald79e913e2007-05-12 11:08:06 +00005271 /* Escape backslashes */
5272 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 *p++ = '\\';
5274 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005275 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005276 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005277
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005278#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005279 /* Map 21-bit characters to '\U00xxxxxx' */
5280 else if (ch >= 0x10000) {
5281 *p++ = '\\';
5282 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005283 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5284 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5285 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5286 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5287 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5288 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5289 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5290 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005292 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005293#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005294 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5295 else if (ch >= 0xD800 && ch < 0xDC00) {
5296 Py_UNICODE ch2;
5297 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005298
Benjamin Peterson29060642009-01-31 22:14:21 +00005299 ch2 = *s++;
5300 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005301 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5303 *p++ = '\\';
5304 *p++ = 'U';
5305 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5306 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5307 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5308 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5309 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5310 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5311 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5312 *p++ = hexdigits[ucs & 0x0000000F];
5313 continue;
5314 }
5315 /* Fall through: isolated surrogates are copied as-is */
5316 s--;
5317 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005318 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005319#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005320
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005322 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 *p++ = '\\';
5324 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005325 *p++ = hexdigits[(ch >> 12) & 0x000F];
5326 *p++ = hexdigits[(ch >> 8) & 0x000F];
5327 *p++ = hexdigits[(ch >> 4) & 0x000F];
5328 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005330
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005331 /* Map special whitespace to '\t', \n', '\r' */
5332 else if (ch == '\t') {
5333 *p++ = '\\';
5334 *p++ = 't';
5335 }
5336 else if (ch == '\n') {
5337 *p++ = '\\';
5338 *p++ = 'n';
5339 }
5340 else if (ch == '\r') {
5341 *p++ = '\\';
5342 *p++ = 'r';
5343 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005344
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005345 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005346 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005348 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005349 *p++ = hexdigits[(ch >> 4) & 0x000F];
5350 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005351 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005352
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 /* Copy everything else as-is */
5354 else
5355 *p++ = (char) ch;
5356 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005358 assert(p - PyBytes_AS_STRING(repr) > 0);
5359 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5360 return NULL;
5361 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362}
5363
Alexander Belopolsky40018472011-02-26 01:02:56 +00005364PyObject *
5365PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005367 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 if (!PyUnicode_Check(unicode)) {
5369 PyErr_BadArgument();
5370 return NULL;
5371 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005372 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5373 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005374 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375}
5376
5377/* --- Raw Unicode Escape Codec ------------------------------------------- */
5378
Alexander Belopolsky40018472011-02-26 01:02:56 +00005379PyObject *
5380PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005381 Py_ssize_t size,
5382 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005385 Py_ssize_t startinpos;
5386 Py_ssize_t endinpos;
5387 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 const char *end;
5391 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005392 PyObject *errorHandler = NULL;
5393 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005394
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 /* Escaped strings will always be longer than the resulting
5396 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005397 length after conversion to the true value. (But decoding error
5398 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 v = _PyUnicode_New(size);
5400 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005401 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005404 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 end = s + size;
5406 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 unsigned char c;
5408 Py_UCS4 x;
5409 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005410 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 /* Non-escape characters are interpreted as Unicode ordinals */
5413 if (*s != '\\') {
5414 *p++ = (unsigned char)*s++;
5415 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005416 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 startinpos = s-starts;
5418
5419 /* \u-escapes are only interpreted iff the number of leading
5420 backslashes if odd */
5421 bs = s;
5422 for (;s < end;) {
5423 if (*s != '\\')
5424 break;
5425 *p++ = (unsigned char)*s++;
5426 }
5427 if (((s - bs) & 1) == 0 ||
5428 s >= end ||
5429 (*s != 'u' && *s != 'U')) {
5430 continue;
5431 }
5432 p--;
5433 count = *s=='u' ? 4 : 8;
5434 s++;
5435
5436 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5437 outpos = p-PyUnicode_AS_UNICODE(v);
5438 for (x = 0, i = 0; i < count; ++i, ++s) {
5439 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005440 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 endinpos = s-starts;
5442 if (unicode_decode_call_errorhandler(
5443 errors, &errorHandler,
5444 "rawunicodeescape", "truncated \\uXXXX",
5445 &starts, &end, &startinpos, &endinpos, &exc, &s,
5446 &v, &outpos, &p))
5447 goto onError;
5448 goto nextByte;
5449 }
5450 x = (x<<4) & ~0xF;
5451 if (c >= '0' && c <= '9')
5452 x += c - '0';
5453 else if (c >= 'a' && c <= 'f')
5454 x += 10 + c - 'a';
5455 else
5456 x += 10 + c - 'A';
5457 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005458 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005459 /* UCS-2 character */
5460 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005461 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005462 /* UCS-4 character. Either store directly, or as
5463 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005464#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005466#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 x -= 0x10000L;
5468 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5469 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005470#endif
5471 } else {
5472 endinpos = s-starts;
5473 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005474 if (unicode_decode_call_errorhandler(
5475 errors, &errorHandler,
5476 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005477 &starts, &end, &startinpos, &endinpos, &exc, &s,
5478 &v, &outpos, &p))
5479 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005480 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 nextByte:
5482 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005484 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005486 Py_XDECREF(errorHandler);
5487 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005488 if (PyUnicode_READY(v) == -1) {
5489 Py_DECREF(v);
5490 return NULL;
5491 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005493
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005496 Py_XDECREF(errorHandler);
5497 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 return NULL;
5499}
5500
Alexander Belopolsky40018472011-02-26 01:02:56 +00005501PyObject *
5502PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005503 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005505 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 char *p;
5507 char *q;
5508
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005509#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005510 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005511#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005512 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005513#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005514
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005515 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005516 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005517
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005518 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 if (repr == NULL)
5520 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005521 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005522 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005524 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 while (size-- > 0) {
5526 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005527#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 /* Map 32-bit characters to '\Uxxxxxxxx' */
5529 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005530 *p++ = '\\';
5531 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005532 *p++ = hexdigits[(ch >> 28) & 0xf];
5533 *p++ = hexdigits[(ch >> 24) & 0xf];
5534 *p++ = hexdigits[(ch >> 20) & 0xf];
5535 *p++ = hexdigits[(ch >> 16) & 0xf];
5536 *p++ = hexdigits[(ch >> 12) & 0xf];
5537 *p++ = hexdigits[(ch >> 8) & 0xf];
5538 *p++ = hexdigits[(ch >> 4) & 0xf];
5539 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005540 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005541 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005542#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005543 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5544 if (ch >= 0xD800 && ch < 0xDC00) {
5545 Py_UNICODE ch2;
5546 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005547
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 ch2 = *s++;
5549 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005550 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5552 *p++ = '\\';
5553 *p++ = 'U';
5554 *p++ = hexdigits[(ucs >> 28) & 0xf];
5555 *p++ = hexdigits[(ucs >> 24) & 0xf];
5556 *p++ = hexdigits[(ucs >> 20) & 0xf];
5557 *p++ = hexdigits[(ucs >> 16) & 0xf];
5558 *p++ = hexdigits[(ucs >> 12) & 0xf];
5559 *p++ = hexdigits[(ucs >> 8) & 0xf];
5560 *p++ = hexdigits[(ucs >> 4) & 0xf];
5561 *p++ = hexdigits[ucs & 0xf];
5562 continue;
5563 }
5564 /* Fall through: isolated surrogates are copied as-is */
5565 s--;
5566 size++;
5567 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005568#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005569 /* Map 16-bit characters to '\uxxxx' */
5570 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 *p++ = '\\';
5572 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005573 *p++ = hexdigits[(ch >> 12) & 0xf];
5574 *p++ = hexdigits[(ch >> 8) & 0xf];
5575 *p++ = hexdigits[(ch >> 4) & 0xf];
5576 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 /* Copy everything else as-is */
5579 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 *p++ = (char) ch;
5581 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005582 size = p - q;
5583
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005584 assert(size > 0);
5585 if (_PyBytes_Resize(&repr, size) < 0)
5586 return NULL;
5587 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588}
5589
Alexander Belopolsky40018472011-02-26 01:02:56 +00005590PyObject *
5591PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005593 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005595 PyErr_BadArgument();
5596 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005598 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5599 PyUnicode_GET_SIZE(unicode));
5600
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005601 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602}
5603
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005604/* --- Unicode Internal Codec ------------------------------------------- */
5605
Alexander Belopolsky40018472011-02-26 01:02:56 +00005606PyObject *
5607_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005608 Py_ssize_t size,
5609 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005610{
5611 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005612 Py_ssize_t startinpos;
5613 Py_ssize_t endinpos;
5614 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005615 PyUnicodeObject *v;
5616 Py_UNICODE *p;
5617 const char *end;
5618 const char *reason;
5619 PyObject *errorHandler = NULL;
5620 PyObject *exc = NULL;
5621
Neal Norwitzd43069c2006-01-08 01:12:10 +00005622#ifdef Py_UNICODE_WIDE
5623 Py_UNICODE unimax = PyUnicode_GetMax();
5624#endif
5625
Thomas Wouters89f507f2006-12-13 04:49:30 +00005626 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005627 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5628 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005630 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5631 as string was created with the old API. */
5632 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005634 p = PyUnicode_AS_UNICODE(v);
5635 end = s + size;
5636
5637 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005638 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005639 /* We have to sanity check the raw data, otherwise doom looms for
5640 some malformed UCS-4 data. */
5641 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005642#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005643 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005644#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005645 end-s < Py_UNICODE_SIZE
5646 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005648 startinpos = s - starts;
5649 if (end-s < Py_UNICODE_SIZE) {
5650 endinpos = end-starts;
5651 reason = "truncated input";
5652 }
5653 else {
5654 endinpos = s - starts + Py_UNICODE_SIZE;
5655 reason = "illegal code point (> 0x10FFFF)";
5656 }
5657 outpos = p - PyUnicode_AS_UNICODE(v);
5658 if (unicode_decode_call_errorhandler(
5659 errors, &errorHandler,
5660 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005661 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005662 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005663 goto onError;
5664 }
5665 }
5666 else {
5667 p++;
5668 s += Py_UNICODE_SIZE;
5669 }
5670 }
5671
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005672 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005673 goto onError;
5674 Py_XDECREF(errorHandler);
5675 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005676 if (PyUnicode_READY(v) == -1) {
5677 Py_DECREF(v);
5678 return NULL;
5679 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005680 return (PyObject *)v;
5681
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005683 Py_XDECREF(v);
5684 Py_XDECREF(errorHandler);
5685 Py_XDECREF(exc);
5686 return NULL;
5687}
5688
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689/* --- Latin-1 Codec ------------------------------------------------------ */
5690
Alexander Belopolsky40018472011-02-26 01:02:56 +00005691PyObject *
5692PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005693 Py_ssize_t size,
5694 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005697 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698}
5699
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005700/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005701static void
5702make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005703 const char *encoding,
5704 const Py_UNICODE *unicode, Py_ssize_t size,
5705 Py_ssize_t startpos, Py_ssize_t endpos,
5706 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005708 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 *exceptionObject = PyUnicodeEncodeError_Create(
5710 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 }
5712 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5714 goto onError;
5715 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5716 goto onError;
5717 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5718 goto onError;
5719 return;
5720 onError:
5721 Py_DECREF(*exceptionObject);
5722 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 }
5724}
5725
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005727static void
5728raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005729 const char *encoding,
5730 const Py_UNICODE *unicode, Py_ssize_t size,
5731 Py_ssize_t startpos, Py_ssize_t endpos,
5732 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005733{
5734 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005736 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005738}
5739
5740/* error handling callback helper:
5741 build arguments, call the callback and check the arguments,
5742 put the result into newpos and return the replacement string, which
5743 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005744static PyObject *
5745unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005746 PyObject **errorHandler,
5747 const char *encoding, const char *reason,
5748 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5749 Py_ssize_t startpos, Py_ssize_t endpos,
5750 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005751{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005752 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753
5754 PyObject *restuple;
5755 PyObject *resunicode;
5756
5757 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005758 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005759 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 }
5762
5763 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005765 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767
5768 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005772 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005773 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 Py_DECREF(restuple);
5775 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005777 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 &resunicode, newpos)) {
5779 Py_DECREF(restuple);
5780 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005782 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5783 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5784 Py_DECREF(restuple);
5785 return NULL;
5786 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005787 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005789 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5791 Py_DECREF(restuple);
5792 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005793 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005794 Py_INCREF(resunicode);
5795 Py_DECREF(restuple);
5796 return resunicode;
5797}
5798
Alexander Belopolsky40018472011-02-26 01:02:56 +00005799static PyObject *
5800unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005801 Py_ssize_t size,
5802 const char *errors,
5803 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804{
5805 /* output object */
5806 PyObject *res;
5807 /* pointers to the beginning and end+1 of input */
5808 const Py_UNICODE *startp = p;
5809 const Py_UNICODE *endp = p + size;
5810 /* pointer to the beginning of the unencodable characters */
5811 /* const Py_UNICODE *badp = NULL; */
5812 /* pointer into the output */
5813 char *str;
5814 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005815 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005816 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5817 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 PyObject *errorHandler = NULL;
5819 PyObject *exc = NULL;
5820 /* the following variable is used for caching string comparisons
5821 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5822 int known_errorHandler = -1;
5823
5824 /* allocate enough for a simple encoding without
5825 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005826 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005827 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005828 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005830 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005831 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005832 ressize = size;
5833
5834 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005836
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 /* can we encode this? */
5838 if (c<limit) {
5839 /* no overflow check, because we know that the space is enough */
5840 *str++ = (char)c;
5841 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005842 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 else {
5844 Py_ssize_t unicodepos = p-startp;
5845 Py_ssize_t requiredsize;
5846 PyObject *repunicode;
5847 Py_ssize_t repsize;
5848 Py_ssize_t newpos;
5849 Py_ssize_t respos;
5850 Py_UNICODE *uni2;
5851 /* startpos for collecting unencodable chars */
5852 const Py_UNICODE *collstart = p;
5853 const Py_UNICODE *collend = p;
5854 /* find all unecodable characters */
5855 while ((collend < endp) && ((*collend)>=limit))
5856 ++collend;
5857 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5858 if (known_errorHandler==-1) {
5859 if ((errors==NULL) || (!strcmp(errors, "strict")))
5860 known_errorHandler = 1;
5861 else if (!strcmp(errors, "replace"))
5862 known_errorHandler = 2;
5863 else if (!strcmp(errors, "ignore"))
5864 known_errorHandler = 3;
5865 else if (!strcmp(errors, "xmlcharrefreplace"))
5866 known_errorHandler = 4;
5867 else
5868 known_errorHandler = 0;
5869 }
5870 switch (known_errorHandler) {
5871 case 1: /* strict */
5872 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5873 goto onError;
5874 case 2: /* replace */
5875 while (collstart++<collend)
5876 *str++ = '?'; /* fall through */
5877 case 3: /* ignore */
5878 p = collend;
5879 break;
5880 case 4: /* xmlcharrefreplace */
5881 respos = str - PyBytes_AS_STRING(res);
5882 /* determine replacement size (temporarily (mis)uses p) */
5883 for (p = collstart, repsize = 0; p < collend; ++p) {
5884 if (*p<10)
5885 repsize += 2+1+1;
5886 else if (*p<100)
5887 repsize += 2+2+1;
5888 else if (*p<1000)
5889 repsize += 2+3+1;
5890 else if (*p<10000)
5891 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005892#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 else
5894 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005895#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 else if (*p<100000)
5897 repsize += 2+5+1;
5898 else if (*p<1000000)
5899 repsize += 2+6+1;
5900 else
5901 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005902#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 }
5904 requiredsize = respos+repsize+(endp-collend);
5905 if (requiredsize > ressize) {
5906 if (requiredsize<2*ressize)
5907 requiredsize = 2*ressize;
5908 if (_PyBytes_Resize(&res, requiredsize))
5909 goto onError;
5910 str = PyBytes_AS_STRING(res) + respos;
5911 ressize = requiredsize;
5912 }
5913 /* generate replacement (temporarily (mis)uses p) */
5914 for (p = collstart; p < collend; ++p) {
5915 str += sprintf(str, "&#%d;", (int)*p);
5916 }
5917 p = collend;
5918 break;
5919 default:
5920 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5921 encoding, reason, startp, size, &exc,
5922 collstart-startp, collend-startp, &newpos);
5923 if (repunicode == NULL)
5924 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005925 if (PyBytes_Check(repunicode)) {
5926 /* Directly copy bytes result to output. */
5927 repsize = PyBytes_Size(repunicode);
5928 if (repsize > 1) {
5929 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005930 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005931 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5932 Py_DECREF(repunicode);
5933 goto onError;
5934 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005935 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005936 ressize += repsize-1;
5937 }
5938 memcpy(str, PyBytes_AsString(repunicode), repsize);
5939 str += repsize;
5940 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005941 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005942 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005943 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 /* need more space? (at least enough for what we
5945 have+the replacement+the rest of the string, so
5946 we won't have to check space for encodable characters) */
5947 respos = str - PyBytes_AS_STRING(res);
5948 repsize = PyUnicode_GET_SIZE(repunicode);
5949 requiredsize = respos+repsize+(endp-collend);
5950 if (requiredsize > ressize) {
5951 if (requiredsize<2*ressize)
5952 requiredsize = 2*ressize;
5953 if (_PyBytes_Resize(&res, requiredsize)) {
5954 Py_DECREF(repunicode);
5955 goto onError;
5956 }
5957 str = PyBytes_AS_STRING(res) + respos;
5958 ressize = requiredsize;
5959 }
5960 /* check if there is anything unencodable in the replacement
5961 and copy it to the output */
5962 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5963 c = *uni2;
5964 if (c >= limit) {
5965 raise_encode_exception(&exc, encoding, startp, size,
5966 unicodepos, unicodepos+1, reason);
5967 Py_DECREF(repunicode);
5968 goto onError;
5969 }
5970 *str = (char)c;
5971 }
5972 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005973 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005974 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005975 }
5976 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005977 /* Resize if we allocated to much */
5978 size = str - PyBytes_AS_STRING(res);
5979 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005980 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005981 if (_PyBytes_Resize(&res, size) < 0)
5982 goto onError;
5983 }
5984
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005985 Py_XDECREF(errorHandler);
5986 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005987 return res;
5988
5989 onError:
5990 Py_XDECREF(res);
5991 Py_XDECREF(errorHandler);
5992 Py_XDECREF(exc);
5993 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994}
5995
Alexander Belopolsky40018472011-02-26 01:02:56 +00005996PyObject *
5997PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005998 Py_ssize_t size,
5999 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006001 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002}
6003
Alexander Belopolsky40018472011-02-26 01:02:56 +00006004PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006005_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006{
6007 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 PyErr_BadArgument();
6009 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006011 if (PyUnicode_READY(unicode) == -1)
6012 return NULL;
6013 /* Fast path: if it is a one-byte string, construct
6014 bytes object directly. */
6015 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6016 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6017 PyUnicode_GET_LENGTH(unicode));
6018 /* Non-Latin-1 characters present. Defer to above function to
6019 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006022 errors);
6023}
6024
6025PyObject*
6026PyUnicode_AsLatin1String(PyObject *unicode)
6027{
6028 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029}
6030
6031/* --- 7-bit ASCII Codec -------------------------------------------------- */
6032
Alexander Belopolsky40018472011-02-26 01:02:56 +00006033PyObject *
6034PyUnicode_DecodeASCII(const char *s,
6035 Py_ssize_t size,
6036 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006038 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 PyUnicodeObject *v;
6040 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006041 Py_ssize_t startinpos;
6042 Py_ssize_t endinpos;
6043 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006045 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006046 PyObject *errorHandler = NULL;
6047 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006048 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006049
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006051 if (size == 1 && *(unsigned char*)s < 128)
6052 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6053
6054 /* Fast path. Assume the input actually *is* ASCII, and allocate
6055 a single-block Unicode object with that assumption. If there is
6056 an error, drop the object and start over. */
6057 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6058 if (v == NULL)
6059 goto onError;
6060 d = PyUnicode_1BYTE_DATA(v);
6061 for (i = 0; i < size; i++) {
6062 unsigned char ch = ((unsigned char*)s)[i];
6063 if (ch < 128)
6064 d[i] = ch;
6065 else
6066 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006067 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006068 if (i == size)
6069 return (PyObject*)v;
6070 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006071
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 v = _PyUnicode_New(size);
6073 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006074 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006078 e = s + size;
6079 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 register unsigned char c = (unsigned char)*s;
6081 if (c < 128) {
6082 *p++ = c;
6083 ++s;
6084 }
6085 else {
6086 startinpos = s-starts;
6087 endinpos = startinpos + 1;
6088 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6089 if (unicode_decode_call_errorhandler(
6090 errors, &errorHandler,
6091 "ascii", "ordinal not in range(128)",
6092 &starts, &e, &startinpos, &endinpos, &exc, &s,
6093 &v, &outpos, &p))
6094 goto onError;
6095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006097 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6099 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006100 Py_XDECREF(errorHandler);
6101 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006102 if (PyUnicode_READY(v) == -1) {
6103 Py_DECREF(v);
6104 return NULL;
6105 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006107
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006110 Py_XDECREF(errorHandler);
6111 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 return NULL;
6113}
6114
Alexander Belopolsky40018472011-02-26 01:02:56 +00006115PyObject *
6116PyUnicode_EncodeASCII(const Py_UNICODE *p,
6117 Py_ssize_t size,
6118 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121}
6122
Alexander Belopolsky40018472011-02-26 01:02:56 +00006123PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006124_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125{
6126 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 PyErr_BadArgument();
6128 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006130 if (PyUnicode_READY(unicode) == -1)
6131 return NULL;
6132 /* Fast path: if it is an ASCII-only string, construct bytes object
6133 directly. Else defer to above function to raise the exception. */
6134 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6135 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6136 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006139 errors);
6140}
6141
6142PyObject *
6143PyUnicode_AsASCIIString(PyObject *unicode)
6144{
6145 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146}
6147
Victor Stinner99b95382011-07-04 14:23:54 +02006148#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006149
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006150/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006151
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006152#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006153#define NEED_RETRY
6154#endif
6155
6156/* XXX This code is limited to "true" double-byte encodings, as
6157 a) it assumes an incomplete character consists of a single byte, and
6158 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006160
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must be reported as a lead byte by IsDBCSLeadByte().  It then
   counts as one when CharPrev() cannot step back (it returns the same
   position), when the previous character does not start with a lead
   byte, or when the previous character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr)) {
        return 0;
    }

    prev = CharPrev(s, curr);
    if (prev == curr) {
        return 1;
    }
    if (!IsDBCSLeadByte(*prev)) {
        return 1;
    }
    return curr - prev == 2;
}
6172
6173/*
6174 * Decode MBCS string into unicode object. If 'final' is set, converts
6175 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6176 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006177static int
6178decode_mbcs(PyUnicodeObject **v,
6179 const char *s, /* MBCS string */
6180 int size, /* sizeof MBCS string */
6181 int final,
6182 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006183{
6184 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006185 Py_ssize_t n;
6186 DWORD usize;
6187 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006188
6189 assert(size >= 0);
6190
Victor Stinner554f3f02010-06-16 23:33:54 +00006191 /* check and handle 'errors' arg */
6192 if (errors==NULL || strcmp(errors, "strict")==0)
6193 flags = MB_ERR_INVALID_CHARS;
6194 else if (strcmp(errors, "ignore")==0)
6195 flags = 0;
6196 else {
6197 PyErr_Format(PyExc_ValueError,
6198 "mbcs encoding does not support errors='%s'",
6199 errors);
6200 return -1;
6201 }
6202
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006203 /* Skip trailing lead-byte unless 'final' is set */
6204 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006206
6207 /* First get the size of the result */
6208 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006209 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6210 if (usize==0)
6211 goto mbcs_decode_error;
6212 } else
6213 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006214
6215 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 /* Create unicode object */
6217 *v = _PyUnicode_New(usize);
6218 if (*v == NULL)
6219 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006220 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006221 }
6222 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 /* Extend unicode object */
6224 n = PyUnicode_GET_SIZE(*v);
6225 if (_PyUnicode_Resize(v, n + usize) < 0)
6226 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006227 }
6228
6229 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006230 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006232 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6233 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006235 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006236 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006237
6238mbcs_decode_error:
6239 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6240 we raise a UnicodeDecodeError - else it is a 'generic'
6241 windows error
6242 */
6243 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6244 /* Ideally, we should get reason from FormatMessage - this
6245 is the Windows 2000 English version of the message
6246 */
6247 PyObject *exc = NULL;
6248 const char *reason = "No mapping for the Unicode character exists "
6249 "in the target multi-byte code page.";
6250 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6251 if (exc != NULL) {
6252 PyCodec_StrictErrors(exc);
6253 Py_DECREF(exc);
6254 }
6255 } else {
6256 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6257 }
6258 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006259}
6260
Alexander Belopolsky40018472011-02-26 01:02:56 +00006261PyObject *
6262PyUnicode_DecodeMBCSStateful(const char *s,
6263 Py_ssize_t size,
6264 const char *errors,
6265 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006266{
6267 PyUnicodeObject *v = NULL;
6268 int done;
6269
6270 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006272
6273#ifdef NEED_RETRY
6274 retry:
6275 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006276 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006277 else
6278#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006279 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006280
6281 if (done < 0) {
6282 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006284 }
6285
6286 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006288
6289#ifdef NEED_RETRY
6290 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 s += done;
6292 size -= done;
6293 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006294 }
6295#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006296 if (PyUnicode_READY(v) == -1) {
6297 Py_DECREF(v);
6298 return NULL;
6299 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006300 return (PyObject *)v;
6301}
6302
Alexander Belopolsky40018472011-02-26 01:02:56 +00006303PyObject *
6304PyUnicode_DecodeMBCS(const char *s,
6305 Py_ssize_t size,
6306 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006307{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006308 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6309}
6310
6311/*
6312 * Convert unicode into string object (MBCS).
6313 * Returns 0 if succeed, -1 otherwise.
6314 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006315static int
6316encode_mbcs(PyObject **repr,
6317 const Py_UNICODE *p, /* unicode */
6318 int size, /* size of unicode */
6319 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006320{
Victor Stinner554f3f02010-06-16 23:33:54 +00006321 BOOL usedDefaultChar = FALSE;
6322 BOOL *pusedDefaultChar;
6323 int mbcssize;
6324 Py_ssize_t n;
6325 PyObject *exc = NULL;
6326 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006327
6328 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006329
Victor Stinner554f3f02010-06-16 23:33:54 +00006330 /* check and handle 'errors' arg */
6331 if (errors==NULL || strcmp(errors, "strict")==0) {
6332 flags = WC_NO_BEST_FIT_CHARS;
6333 pusedDefaultChar = &usedDefaultChar;
6334 } else if (strcmp(errors, "replace")==0) {
6335 flags = 0;
6336 pusedDefaultChar = NULL;
6337 } else {
6338 PyErr_Format(PyExc_ValueError,
6339 "mbcs encoding does not support errors='%s'",
6340 errors);
6341 return -1;
6342 }
6343
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006344 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006345 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006346 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6347 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 if (mbcssize == 0) {
6349 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6350 return -1;
6351 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006352 /* If we used a default char, then we failed! */
6353 if (pusedDefaultChar && *pusedDefaultChar)
6354 goto mbcs_encode_error;
6355 } else {
6356 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006357 }
6358
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006359 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 /* Create string object */
6361 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6362 if (*repr == NULL)
6363 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006364 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006365 }
6366 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 /* Extend string object */
6368 n = PyBytes_Size(*repr);
6369 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6370 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006371 }
6372
6373 /* Do the conversion */
6374 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006376 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6377 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6379 return -1;
6380 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006381 if (pusedDefaultChar && *pusedDefaultChar)
6382 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006383 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006384 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006385
6386mbcs_encode_error:
6387 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6388 Py_XDECREF(exc);
6389 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006390}
6391
Alexander Belopolsky40018472011-02-26 01:02:56 +00006392PyObject *
6393PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6394 Py_ssize_t size,
6395 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006396{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006397 PyObject *repr = NULL;
6398 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006399
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006400#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006402 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006403 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006404 else
6405#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006406 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006407
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006408 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 Py_XDECREF(repr);
6410 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006411 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006412
6413#ifdef NEED_RETRY
6414 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 p += INT_MAX;
6416 size -= INT_MAX;
6417 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006418 }
6419#endif
6420
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006421 return repr;
6422}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006423
Alexander Belopolsky40018472011-02-26 01:02:56 +00006424PyObject *
6425PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006426{
6427 if (!PyUnicode_Check(unicode)) {
6428 PyErr_BadArgument();
6429 return NULL;
6430 }
6431 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 PyUnicode_GET_SIZE(unicode),
6433 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006434}
6435
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006436#undef NEED_RETRY
6437
Victor Stinner99b95382011-07-04 14:23:54 +02006438#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006439
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440/* --- Character Mapping Codec -------------------------------------------- */
6441
Alexander Belopolsky40018472011-02-26 01:02:56 +00006442PyObject *
6443PyUnicode_DecodeCharmap(const char *s,
6444 Py_ssize_t size,
6445 PyObject *mapping,
6446 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006448 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006449 Py_ssize_t startinpos;
6450 Py_ssize_t endinpos;
6451 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 PyUnicodeObject *v;
6454 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006455 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006456 PyObject *errorHandler = NULL;
6457 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006458 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006459 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006460
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 /* Default to Latin-1 */
6462 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464
6465 v = _PyUnicode_New(size);
6466 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006471 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006472 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 mapstring = PyUnicode_AS_UNICODE(mapping);
6474 maplen = PyUnicode_GET_SIZE(mapping);
6475 while (s < e) {
6476 unsigned char ch = *s;
6477 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 if (ch < maplen)
6480 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 if (x == 0xfffe) {
6483 /* undefined mapping */
6484 outpos = p-PyUnicode_AS_UNICODE(v);
6485 startinpos = s-starts;
6486 endinpos = startinpos+1;
6487 if (unicode_decode_call_errorhandler(
6488 errors, &errorHandler,
6489 "charmap", "character maps to <undefined>",
6490 &starts, &e, &startinpos, &endinpos, &exc, &s,
6491 &v, &outpos, &p)) {
6492 goto onError;
6493 }
6494 continue;
6495 }
6496 *p++ = x;
6497 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006498 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006499 }
6500 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 while (s < e) {
6502 unsigned char ch = *s;
6503 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006504
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6506 w = PyLong_FromLong((long)ch);
6507 if (w == NULL)
6508 goto onError;
6509 x = PyObject_GetItem(mapping, w);
6510 Py_DECREF(w);
6511 if (x == NULL) {
6512 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6513 /* No mapping found means: mapping is undefined. */
6514 PyErr_Clear();
6515 x = Py_None;
6516 Py_INCREF(x);
6517 } else
6518 goto onError;
6519 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006520
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 /* Apply mapping */
6522 if (PyLong_Check(x)) {
6523 long value = PyLong_AS_LONG(x);
6524 if (value < 0 || value > 65535) {
6525 PyErr_SetString(PyExc_TypeError,
6526 "character mapping must be in range(65536)");
6527 Py_DECREF(x);
6528 goto onError;
6529 }
6530 *p++ = (Py_UNICODE)value;
6531 }
6532 else if (x == Py_None) {
6533 /* undefined mapping */
6534 outpos = p-PyUnicode_AS_UNICODE(v);
6535 startinpos = s-starts;
6536 endinpos = startinpos+1;
6537 if (unicode_decode_call_errorhandler(
6538 errors, &errorHandler,
6539 "charmap", "character maps to <undefined>",
6540 &starts, &e, &startinpos, &endinpos, &exc, &s,
6541 &v, &outpos, &p)) {
6542 Py_DECREF(x);
6543 goto onError;
6544 }
6545 Py_DECREF(x);
6546 continue;
6547 }
6548 else if (PyUnicode_Check(x)) {
6549 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006550
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 if (targetsize == 1)
6552 /* 1-1 mapping */
6553 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006554
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 else if (targetsize > 1) {
6556 /* 1-n mapping */
6557 if (targetsize > extrachars) {
6558 /* resize first */
6559 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6560 Py_ssize_t needed = (targetsize - extrachars) + \
6561 (targetsize << 2);
6562 extrachars += needed;
6563 /* XXX overflow detection missing */
6564 if (_PyUnicode_Resize(&v,
6565 PyUnicode_GET_SIZE(v) + needed) < 0) {
6566 Py_DECREF(x);
6567 goto onError;
6568 }
6569 p = PyUnicode_AS_UNICODE(v) + oldpos;
6570 }
6571 Py_UNICODE_COPY(p,
6572 PyUnicode_AS_UNICODE(x),
6573 targetsize);
6574 p += targetsize;
6575 extrachars -= targetsize;
6576 }
6577 /* 1-0 mapping: skip the character */
6578 }
6579 else {
6580 /* wrong return value */
6581 PyErr_SetString(PyExc_TypeError,
6582 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006583 Py_DECREF(x);
6584 goto onError;
6585 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 Py_DECREF(x);
6587 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 }
6590 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6592 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006593 Py_XDECREF(errorHandler);
6594 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006595 if (PyUnicode_READY(v) == -1) {
6596 Py_DECREF(v);
6597 return NULL;
6598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006600
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006602 Py_XDECREF(errorHandler);
6603 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 Py_XDECREF(v);
6605 return NULL;
6606}
6607
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006608/* Charmap encoding: the lookup table */
6609
Alexander Belopolsky40018472011-02-26 01:02:56 +00006610struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 PyObject_HEAD
6612 unsigned char level1[32];
6613 int count2, count3;
6614 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006615};
6616
6617static PyObject*
6618encoding_map_size(PyObject *obj, PyObject* args)
6619{
6620 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006621 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006623}
6624
6625static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006626 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 PyDoc_STR("Return the size (in bytes) of this object") },
6628 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006629};
6630
6631static void
6632encoding_map_dealloc(PyObject* o)
6633{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006634 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006635}
6636
6637static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006638 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 "EncodingMap", /*tp_name*/
6640 sizeof(struct encoding_map), /*tp_basicsize*/
6641 0, /*tp_itemsize*/
6642 /* methods */
6643 encoding_map_dealloc, /*tp_dealloc*/
6644 0, /*tp_print*/
6645 0, /*tp_getattr*/
6646 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006647 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006648 0, /*tp_repr*/
6649 0, /*tp_as_number*/
6650 0, /*tp_as_sequence*/
6651 0, /*tp_as_mapping*/
6652 0, /*tp_hash*/
6653 0, /*tp_call*/
6654 0, /*tp_str*/
6655 0, /*tp_getattro*/
6656 0, /*tp_setattro*/
6657 0, /*tp_as_buffer*/
6658 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6659 0, /*tp_doc*/
6660 0, /*tp_traverse*/
6661 0, /*tp_clear*/
6662 0, /*tp_richcompare*/
6663 0, /*tp_weaklistoffset*/
6664 0, /*tp_iter*/
6665 0, /*tp_iternext*/
6666 encoding_map_methods, /*tp_methods*/
6667 0, /*tp_members*/
6668 0, /*tp_getset*/
6669 0, /*tp_base*/
6670 0, /*tp_dict*/
6671 0, /*tp_descr_get*/
6672 0, /*tp_descr_set*/
6673 0, /*tp_dictoffset*/
6674 0, /*tp_init*/
6675 0, /*tp_alloc*/
6676 0, /*tp_new*/
6677 0, /*tp_free*/
6678 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006679};
6680
6681PyObject*
6682PyUnicode_BuildEncodingMap(PyObject* string)
6683{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006684 PyObject *result;
6685 struct encoding_map *mresult;
6686 int i;
6687 int need_dict = 0;
6688 unsigned char level1[32];
6689 unsigned char level2[512];
6690 unsigned char *mlevel1, *mlevel2, *mlevel3;
6691 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006692 int kind;
6693 void *data;
6694 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006696 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006697 PyErr_BadArgument();
6698 return NULL;
6699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006700 kind = PyUnicode_KIND(string);
6701 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006702 memset(level1, 0xFF, sizeof level1);
6703 memset(level2, 0xFF, sizeof level2);
6704
6705 /* If there isn't a one-to-one mapping of NULL to \0,
6706 or if there are non-BMP characters, we need to use
6707 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006708 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006709 need_dict = 1;
6710 for (i = 1; i < 256; i++) {
6711 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006712 ch = PyUnicode_READ(kind, data, i);
6713 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006714 need_dict = 1;
6715 break;
6716 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006717 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006718 /* unmapped character */
6719 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006720 l1 = ch >> 11;
6721 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006722 if (level1[l1] == 0xFF)
6723 level1[l1] = count2++;
6724 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006725 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006726 }
6727
6728 if (count2 >= 0xFF || count3 >= 0xFF)
6729 need_dict = 1;
6730
6731 if (need_dict) {
6732 PyObject *result = PyDict_New();
6733 PyObject *key, *value;
6734 if (!result)
6735 return NULL;
6736 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006737 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006738 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006739 if (!key || !value)
6740 goto failed1;
6741 if (PyDict_SetItem(result, key, value) == -1)
6742 goto failed1;
6743 Py_DECREF(key);
6744 Py_DECREF(value);
6745 }
6746 return result;
6747 failed1:
6748 Py_XDECREF(key);
6749 Py_XDECREF(value);
6750 Py_DECREF(result);
6751 return NULL;
6752 }
6753
6754 /* Create a three-level trie */
6755 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6756 16*count2 + 128*count3 - 1);
6757 if (!result)
6758 return PyErr_NoMemory();
6759 PyObject_Init(result, &EncodingMapType);
6760 mresult = (struct encoding_map*)result;
6761 mresult->count2 = count2;
6762 mresult->count3 = count3;
6763 mlevel1 = mresult->level1;
6764 mlevel2 = mresult->level23;
6765 mlevel3 = mresult->level23 + 16*count2;
6766 memcpy(mlevel1, level1, 32);
6767 memset(mlevel2, 0xFF, 16*count2);
6768 memset(mlevel3, 0, 128*count3);
6769 count3 = 0;
6770 for (i = 1; i < 256; i++) {
6771 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006772 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006773 /* unmapped character */
6774 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006775 o1 = PyUnicode_READ(kind, data, i)>>11;
6776 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006777 i2 = 16*mlevel1[o1] + o2;
6778 if (mlevel2[i2] == 0xFF)
6779 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006780 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006781 i3 = 128*mlevel2[i2] + o3;
6782 mlevel3[i3] = i;
6783 }
6784 return result;
6785}
6786
6787static int
6788encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6789{
6790 struct encoding_map *map = (struct encoding_map*)mapping;
6791 int l1 = c>>11;
6792 int l2 = (c>>7) & 0xF;
6793 int l3 = c & 0x7F;
6794 int i;
6795
6796#ifdef Py_UNICODE_WIDE
6797 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006799 }
6800#endif
6801 if (c == 0)
6802 return 0;
6803 /* level 1*/
6804 i = map->level1[l1];
6805 if (i == 0xFF) {
6806 return -1;
6807 }
6808 /* level 2*/
6809 i = map->level23[16*i+l2];
6810 if (i == 0xFF) {
6811 return -1;
6812 }
6813 /* level 3 */
6814 i = map->level23[16*map->count2 + 128*i + l3];
6815 if (i == 0) {
6816 return -1;
6817 }
6818 return i;
6819}
6820
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006821/* Lookup the character ch in the mapping. If the character
6822 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006823 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006824static PyObject *
6825charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826{
Christian Heimes217cfd12007-12-02 14:31:20 +00006827 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006828 PyObject *x;
6829
6830 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006832 x = PyObject_GetItem(mapping, w);
6833 Py_DECREF(w);
6834 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6836 /* No mapping found means: mapping is undefined. */
6837 PyErr_Clear();
6838 x = Py_None;
6839 Py_INCREF(x);
6840 return x;
6841 } else
6842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006844 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006846 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 long value = PyLong_AS_LONG(x);
6848 if (value < 0 || value > 255) {
6849 PyErr_SetString(PyExc_TypeError,
6850 "character mapping must be in range(256)");
6851 Py_DECREF(x);
6852 return NULL;
6853 }
6854 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006856 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 /* wrong return value */
6860 PyErr_Format(PyExc_TypeError,
6861 "character mapping must return integer, bytes or None, not %.400s",
6862 x->ob_type->tp_name);
6863 Py_DECREF(x);
6864 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 }
6866}
6867
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006868static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006869charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006870{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006871 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6872 /* exponentially overallocate to minimize reallocations */
6873 if (requiredsize < 2*outsize)
6874 requiredsize = 2*outsize;
6875 if (_PyBytes_Resize(outobj, requiredsize))
6876 return -1;
6877 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006878}
6879
/* Outcome of trying to encode one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006883/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006884 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006885 space is available. Return a new reference to the object that
6886 was put in the output buffer, or Py_None, if the mapping was undefined
6887 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006888 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006889static charmapencode_result
6890charmapencode_output(Py_UNICODE c, PyObject *mapping,
6891 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006892{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006893 PyObject *rep;
6894 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006895 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896
Christian Heimes90aa7642007-12-19 02:45:37 +00006897 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006898 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006900 if (res == -1)
6901 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 if (outsize<requiredsize)
6903 if (charmapencode_resize(outobj, outpos, requiredsize))
6904 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006905 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 outstart[(*outpos)++] = (char)res;
6907 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006908 }
6909
6910 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006911 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006912 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006913 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 Py_DECREF(rep);
6915 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006916 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 if (PyLong_Check(rep)) {
6918 Py_ssize_t requiredsize = *outpos+1;
6919 if (outsize<requiredsize)
6920 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6921 Py_DECREF(rep);
6922 return enc_EXCEPTION;
6923 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006924 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006926 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 else {
6928 const char *repchars = PyBytes_AS_STRING(rep);
6929 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6930 Py_ssize_t requiredsize = *outpos+repsize;
6931 if (outsize<requiredsize)
6932 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6933 Py_DECREF(rep);
6934 return enc_EXCEPTION;
6935 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006936 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 memcpy(outstart + *outpos, repchars, repsize);
6938 *outpos += repsize;
6939 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006940 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006941 Py_DECREF(rep);
6942 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006943}
6944
6945/* handle an error in PyUnicode_EncodeCharmap
6946 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006947static int
6948charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006949 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006950 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006951 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006952 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006953{
6954 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006955 Py_ssize_t repsize;
6956 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006957 Py_UNICODE *uni2;
6958 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006959 Py_ssize_t collstartpos = *inpos;
6960 Py_ssize_t collendpos = *inpos+1;
6961 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006962 char *encoding = "charmap";
6963 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006964 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006965
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006966 /* find all unencodable characters */
6967 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006968 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006969 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 int res = encoding_map_lookup(p[collendpos], mapping);
6971 if (res != -1)
6972 break;
6973 ++collendpos;
6974 continue;
6975 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006976
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 rep = charmapencode_lookup(p[collendpos], mapping);
6978 if (rep==NULL)
6979 return -1;
6980 else if (rep!=Py_None) {
6981 Py_DECREF(rep);
6982 break;
6983 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006984 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006986 }
6987 /* cache callback name lookup
6988 * (if not done yet, i.e. it's the first error) */
6989 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006990 if ((errors==NULL) || (!strcmp(errors, "strict")))
6991 *known_errorHandler = 1;
6992 else if (!strcmp(errors, "replace"))
6993 *known_errorHandler = 2;
6994 else if (!strcmp(errors, "ignore"))
6995 *known_errorHandler = 3;
6996 else if (!strcmp(errors, "xmlcharrefreplace"))
6997 *known_errorHandler = 4;
6998 else
6999 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007000 }
7001 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007002 case 1: /* strict */
7003 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7004 return -1;
7005 case 2: /* replace */
7006 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 x = charmapencode_output('?', mapping, res, respos);
7008 if (x==enc_EXCEPTION) {
7009 return -1;
7010 }
7011 else if (x==enc_FAILED) {
7012 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7013 return -1;
7014 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007015 }
7016 /* fall through */
7017 case 3: /* ignore */
7018 *inpos = collendpos;
7019 break;
7020 case 4: /* xmlcharrefreplace */
7021 /* generate replacement (temporarily (mis)uses p) */
7022 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 char buffer[2+29+1+1];
7024 char *cp;
7025 sprintf(buffer, "&#%d;", (int)p[collpos]);
7026 for (cp = buffer; *cp; ++cp) {
7027 x = charmapencode_output(*cp, mapping, res, respos);
7028 if (x==enc_EXCEPTION)
7029 return -1;
7030 else if (x==enc_FAILED) {
7031 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7032 return -1;
7033 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007034 }
7035 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007036 *inpos = collendpos;
7037 break;
7038 default:
7039 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 encoding, reason, p, size, exceptionObject,
7041 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007042 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007044 if (PyBytes_Check(repunicode)) {
7045 /* Directly copy bytes result to output. */
7046 Py_ssize_t outsize = PyBytes_Size(*res);
7047 Py_ssize_t requiredsize;
7048 repsize = PyBytes_Size(repunicode);
7049 requiredsize = *respos + repsize;
7050 if (requiredsize > outsize)
7051 /* Make room for all additional bytes. */
7052 if (charmapencode_resize(res, respos, requiredsize)) {
7053 Py_DECREF(repunicode);
7054 return -1;
7055 }
7056 memcpy(PyBytes_AsString(*res) + *respos,
7057 PyBytes_AsString(repunicode), repsize);
7058 *respos += repsize;
7059 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007060 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007061 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007062 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007063 /* generate replacement */
7064 repsize = PyUnicode_GET_SIZE(repunicode);
7065 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 x = charmapencode_output(*uni2, mapping, res, respos);
7067 if (x==enc_EXCEPTION) {
7068 return -1;
7069 }
7070 else if (x==enc_FAILED) {
7071 Py_DECREF(repunicode);
7072 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7073 return -1;
7074 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007075 }
7076 *inpos = newpos;
7077 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007078 }
7079 return 0;
7080}
7081
Alexander Belopolsky40018472011-02-26 01:02:56 +00007082PyObject *
7083PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7084 Py_ssize_t size,
7085 PyObject *mapping,
7086 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007088 /* output object */
7089 PyObject *res = NULL;
7090 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007091 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007092 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007093 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007094 PyObject *errorHandler = NULL;
7095 PyObject *exc = NULL;
7096 /* the following variable is used for caching string comparisons
7097 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7098 * 3=ignore, 4=xmlcharrefreplace */
7099 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100
7101 /* Default to Latin-1 */
7102 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007103 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007105 /* allocate enough for a simple encoding without
7106 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007107 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007108 if (res == NULL)
7109 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007110 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007111 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007113 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 /* try to encode it */
7115 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7116 if (x==enc_EXCEPTION) /* error */
7117 goto onError;
7118 if (x==enc_FAILED) { /* unencodable character */
7119 if (charmap_encoding_error(p, size, &inpos, mapping,
7120 &exc,
7121 &known_errorHandler, &errorHandler, errors,
7122 &res, &respos)) {
7123 goto onError;
7124 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007125 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 else
7127 /* done with this character => adjust input position */
7128 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007131 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007132 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007133 if (_PyBytes_Resize(&res, respos) < 0)
7134 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007135
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007136 Py_XDECREF(exc);
7137 Py_XDECREF(errorHandler);
7138 return res;
7139
Benjamin Peterson29060642009-01-31 22:14:21 +00007140 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007141 Py_XDECREF(res);
7142 Py_XDECREF(exc);
7143 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144 return NULL;
7145}
7146
Alexander Belopolsky40018472011-02-26 01:02:56 +00007147PyObject *
7148PyUnicode_AsCharmapString(PyObject *unicode,
7149 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150{
7151 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007152 PyErr_BadArgument();
7153 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154 }
7155 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007156 PyUnicode_GET_SIZE(unicode),
7157 mapping,
7158 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159}
7160
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007161/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007162static void
7163make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007164 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007165 Py_ssize_t startpos, Py_ssize_t endpos,
7166 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007168 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007169 *exceptionObject = _PyUnicodeTranslateError_Create(
7170 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171 }
7172 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007173 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7174 goto onError;
7175 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7176 goto onError;
7177 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7178 goto onError;
7179 return;
7180 onError:
7181 Py_DECREF(*exceptionObject);
7182 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 }
7184}
7185
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007186/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007187static void
7188raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007189 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007190 Py_ssize_t startpos, Py_ssize_t endpos,
7191 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007192{
7193 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007194 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007195 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007197}
7198
7199/* error handling callback helper:
7200 build arguments, call the callback and check the arguments,
7201 put the result into newpos and return the replacement string, which
7202 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007203static PyObject *
7204unicode_translate_call_errorhandler(const char *errors,
7205 PyObject **errorHandler,
7206 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007207 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007208 Py_ssize_t startpos, Py_ssize_t endpos,
7209 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007210{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007211 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007212
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007213 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007214 PyObject *restuple;
7215 PyObject *resunicode;
7216
7217 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007219 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007220 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007221 }
7222
7223 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007224 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007225 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007227
7228 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007230 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007232 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007233 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 Py_DECREF(restuple);
7235 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007236 }
7237 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 &resunicode, &i_newpos)) {
7239 Py_DECREF(restuple);
7240 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007241 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007242 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007243 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007244 else
7245 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007246 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7248 Py_DECREF(restuple);
7249 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007250 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007251 Py_INCREF(resunicode);
7252 Py_DECREF(restuple);
7253 return resunicode;
7254}
7255
7256/* Lookup the character ch in the mapping and put the result in result,
7257 which must be decrefed by the caller.
7258 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007259static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007260charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007261{
Christian Heimes217cfd12007-12-02 14:31:20 +00007262 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007263 PyObject *x;
7264
7265 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007267 x = PyObject_GetItem(mapping, w);
7268 Py_DECREF(w);
7269 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7271 /* No mapping found means: use 1:1 mapping. */
7272 PyErr_Clear();
7273 *result = NULL;
7274 return 0;
7275 } else
7276 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007277 }
7278 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 *result = x;
7280 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007281 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007282 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007283 long value = PyLong_AS_LONG(x);
7284 long max = PyUnicode_GetMax();
7285 if (value < 0 || value > max) {
7286 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007287 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007288 Py_DECREF(x);
7289 return -1;
7290 }
7291 *result = x;
7292 return 0;
7293 }
7294 else if (PyUnicode_Check(x)) {
7295 *result = x;
7296 return 0;
7297 }
7298 else {
7299 /* wrong return value */
7300 PyErr_SetString(PyExc_TypeError,
7301 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007302 Py_DECREF(x);
7303 return -1;
7304 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007305}
7306/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 if not reallocate and adjust various state variables.
7308 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007309static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007310charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007312{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007313 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007314 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 /* exponentially overallocate to minimize reallocations */
7316 if (requiredsize < 2 * oldsize)
7317 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007318 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7319 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007321 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007322 }
7323 return 0;
7324}
7325/* lookup the character, put the result in the output string and adjust
7326 various state variables. Return a new reference to the object that
7327 was put in the output buffer in *result, or Py_None, if the mapping was
7328 undefined (in which case no character was written).
7329 The called must decref result.
7330 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007331static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007332charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7333 PyObject *mapping, Py_UCS4 **output,
7334 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007335 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007337 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7338 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007340 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007342 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007343 }
7344 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007346 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007348 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007349 }
7350 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007351 Py_ssize_t repsize;
7352 if (PyUnicode_READY(*res) == -1)
7353 return -1;
7354 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 if (repsize==1) {
7356 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007357 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 }
7359 else if (repsize!=0) {
7360 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007361 Py_ssize_t requiredsize = *opos +
7362 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007364 Py_ssize_t i;
7365 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007367 for(i = 0; i < repsize; i++)
7368 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007370 }
7371 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007372 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007373 return 0;
7374}
7375
Alexander Belopolsky40018472011-02-26 01:02:56 +00007376PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007377_PyUnicode_TranslateCharmap(PyObject *input,
7378 PyObject *mapping,
7379 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007381 /* input object */
7382 char *idata;
7383 Py_ssize_t size, i;
7384 int kind;
7385 /* output buffer */
7386 Py_UCS4 *output = NULL;
7387 Py_ssize_t osize;
7388 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007389 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007390 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007391 char *reason = "character maps to <undefined>";
7392 PyObject *errorHandler = NULL;
7393 PyObject *exc = NULL;
7394 /* the following variable is used for caching string comparisons
7395 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7396 * 3=ignore, 4=xmlcharrefreplace */
7397 int known_errorHandler = -1;
7398
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 PyErr_BadArgument();
7401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007404 if (PyUnicode_READY(input) == -1)
7405 return NULL;
7406 idata = (char*)PyUnicode_DATA(input);
7407 kind = PyUnicode_KIND(input);
7408 size = PyUnicode_GET_LENGTH(input);
7409 i = 0;
7410
7411 if (size == 0) {
7412 Py_INCREF(input);
7413 return input;
7414 }
7415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007416 /* allocate enough for a simple 1:1 translation without
7417 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007418 osize = size;
7419 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7420 opos = 0;
7421 if (output == NULL) {
7422 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007426 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 /* try to encode it */
7428 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007429 if (charmaptranslate_output(input, i, mapping,
7430 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 Py_XDECREF(x);
7432 goto onError;
7433 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007434 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007436 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 else { /* untranslatable character */
7438 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7439 Py_ssize_t repsize;
7440 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007441 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007443 Py_ssize_t collstart = i;
7444 Py_ssize_t collend = i+1;
7445 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007448 while (collend < size) {
7449 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 goto onError;
7451 Py_XDECREF(x);
7452 if (x!=Py_None)
7453 break;
7454 ++collend;
7455 }
7456 /* cache callback name lookup
7457 * (if not done yet, i.e. it's the first error) */
7458 if (known_errorHandler==-1) {
7459 if ((errors==NULL) || (!strcmp(errors, "strict")))
7460 known_errorHandler = 1;
7461 else if (!strcmp(errors, "replace"))
7462 known_errorHandler = 2;
7463 else if (!strcmp(errors, "ignore"))
7464 known_errorHandler = 3;
7465 else if (!strcmp(errors, "xmlcharrefreplace"))
7466 known_errorHandler = 4;
7467 else
7468 known_errorHandler = 0;
7469 }
7470 switch (known_errorHandler) {
7471 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007472 raise_translate_exception(&exc, input, collstart,
7473 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007474 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 case 2: /* replace */
7476 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007477 for (coll = collstart; coll<collend; coll++)
7478 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 /* fall through */
7480 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007481 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007482 break;
7483 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007484 /* generate replacement (temporarily (mis)uses i) */
7485 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 char buffer[2+29+1+1];
7487 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007488 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7489 if (charmaptranslate_makespace(&output, &osize,
7490 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 goto onError;
7492 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007493 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007495 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 break;
7497 default:
7498 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007499 reason, input, &exc,
7500 collstart, collend, &newpos);
7501 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 goto onError;
7503 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007504 repsize = PyUnicode_GET_LENGTH(repunicode);
7505 if (charmaptranslate_makespace(&output, &osize,
7506 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 Py_DECREF(repunicode);
7508 goto onError;
7509 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007510 for (uni2 = 0; repsize-->0; ++uni2)
7511 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7512 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007513 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007514 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007515 }
7516 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007517 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7518 if (!res)
7519 goto onError;
7520 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007521 Py_XDECREF(exc);
7522 Py_XDECREF(errorHandler);
7523 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007526 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007527 Py_XDECREF(exc);
7528 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 return NULL;
7530}
7531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007532/* Deprecated. Use PyUnicode_Translate instead. */
7533PyObject *
7534PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7535 Py_ssize_t size,
7536 PyObject *mapping,
7537 const char *errors)
7538{
7539 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7540 if (!unicode)
7541 return NULL;
7542 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7543}
7544
Alexander Belopolsky40018472011-02-26 01:02:56 +00007545PyObject *
7546PyUnicode_Translate(PyObject *str,
7547 PyObject *mapping,
7548 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549{
7550 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007551
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 str = PyUnicode_FromObject(str);
7553 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007554 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007555 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 Py_DECREF(str);
7557 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007558
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560 Py_XDECREF(str);
7561 return NULL;
7562}
Tim Petersced69f82003-09-16 20:30:58 +00007563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007564static Py_UCS4
7565fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7566{
7567 /* No need to call PyUnicode_READY(self) because this function is only
7568 called as a callback from fixup() which does it already. */
7569 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7570 const int kind = PyUnicode_KIND(self);
7571 void *data = PyUnicode_DATA(self);
7572 Py_UCS4 maxchar = 0, ch, fixed;
7573 Py_ssize_t i;
7574
7575 for (i = 0; i < len; ++i) {
7576 ch = PyUnicode_READ(kind, data, i);
7577 fixed = 0;
7578 if (ch > 127) {
7579 if (Py_UNICODE_ISSPACE(ch))
7580 fixed = ' ';
7581 else {
7582 const int decimal = Py_UNICODE_TODECIMAL(ch);
7583 if (decimal >= 0)
7584 fixed = '0' + decimal;
7585 }
7586 if (fixed != 0) {
7587 if (fixed > maxchar)
7588 maxchar = fixed;
7589 PyUnicode_WRITE(kind, data, i, fixed);
7590 }
7591 else if (ch > maxchar)
7592 maxchar = ch;
7593 }
7594 else if (ch > maxchar)
7595 maxchar = ch;
7596 }
7597
7598 return maxchar;
7599}
7600
7601PyObject *
7602_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7603{
7604 if (!PyUnicode_Check(unicode)) {
7605 PyErr_BadInternalCall();
7606 return NULL;
7607 }
7608 if (PyUnicode_READY(unicode) == -1)
7609 return NULL;
7610 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7611 /* If the string is already ASCII, just return the same string */
7612 Py_INCREF(unicode);
7613 return unicode;
7614 }
7615 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7616}
7617
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007618PyObject *
7619PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7620 Py_ssize_t length)
7621{
7622 PyObject *result;
7623 Py_UNICODE *p; /* write pointer into result */
7624 Py_ssize_t i;
7625 /* Copy to a new string */
7626 result = (PyObject *)_PyUnicode_New(length);
7627 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7628 if (result == NULL)
7629 return result;
7630 p = PyUnicode_AS_UNICODE(result);
7631 /* Iterate over code points */
7632 for (i = 0; i < length; i++) {
7633 Py_UNICODE ch =s[i];
7634 if (ch > 127) {
7635 int decimal = Py_UNICODE_TODECIMAL(ch);
7636 if (decimal >= 0)
7637 p[i] = '0' + decimal;
7638 }
7639 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007640 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7641 Py_DECREF(result);
7642 return NULL;
7643 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007644 return result;
7645}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007646/* --- Decimal Encoder ---------------------------------------------------- */
7647
Alexander Belopolsky40018472011-02-26 01:02:56 +00007648int
7649PyUnicode_EncodeDecimal(Py_UNICODE *s,
7650 Py_ssize_t length,
7651 char *output,
7652 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007653{
7654 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007655 PyObject *errorHandler = NULL;
7656 PyObject *exc = NULL;
7657 const char *encoding = "decimal";
7658 const char *reason = "invalid decimal Unicode string";
7659 /* the following variable is used for caching string comparisons
7660 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7661 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007662
7663 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 PyErr_BadArgument();
7665 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007666 }
7667
7668 p = s;
7669 end = s + length;
7670 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 register Py_UNICODE ch = *p;
7672 int decimal;
7673 PyObject *repunicode;
7674 Py_ssize_t repsize;
7675 Py_ssize_t newpos;
7676 Py_UNICODE *uni2;
7677 Py_UNICODE *collstart;
7678 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007679
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007681 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 ++p;
7683 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007684 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 decimal = Py_UNICODE_TODECIMAL(ch);
7686 if (decimal >= 0) {
7687 *output++ = '0' + decimal;
7688 ++p;
7689 continue;
7690 }
7691 if (0 < ch && ch < 256) {
7692 *output++ = (char)ch;
7693 ++p;
7694 continue;
7695 }
7696 /* All other characters are considered unencodable */
7697 collstart = p;
7698 collend = p+1;
7699 while (collend < end) {
7700 if ((0 < *collend && *collend < 256) ||
7701 !Py_UNICODE_ISSPACE(*collend) ||
7702 Py_UNICODE_TODECIMAL(*collend))
7703 break;
7704 }
7705 /* cache callback name lookup
7706 * (if not done yet, i.e. it's the first error) */
7707 if (known_errorHandler==-1) {
7708 if ((errors==NULL) || (!strcmp(errors, "strict")))
7709 known_errorHandler = 1;
7710 else if (!strcmp(errors, "replace"))
7711 known_errorHandler = 2;
7712 else if (!strcmp(errors, "ignore"))
7713 known_errorHandler = 3;
7714 else if (!strcmp(errors, "xmlcharrefreplace"))
7715 known_errorHandler = 4;
7716 else
7717 known_errorHandler = 0;
7718 }
7719 switch (known_errorHandler) {
7720 case 1: /* strict */
7721 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7722 goto onError;
7723 case 2: /* replace */
7724 for (p = collstart; p < collend; ++p)
7725 *output++ = '?';
7726 /* fall through */
7727 case 3: /* ignore */
7728 p = collend;
7729 break;
7730 case 4: /* xmlcharrefreplace */
7731 /* generate replacement (temporarily (mis)uses p) */
7732 for (p = collstart; p < collend; ++p)
7733 output += sprintf(output, "&#%d;", (int)*p);
7734 p = collend;
7735 break;
7736 default:
7737 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7738 encoding, reason, s, length, &exc,
7739 collstart-s, collend-s, &newpos);
7740 if (repunicode == NULL)
7741 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007742 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007743 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007744 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7745 Py_DECREF(repunicode);
7746 goto onError;
7747 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 /* generate replacement */
7749 repsize = PyUnicode_GET_SIZE(repunicode);
7750 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7751 Py_UNICODE ch = *uni2;
7752 if (Py_UNICODE_ISSPACE(ch))
7753 *output++ = ' ';
7754 else {
7755 decimal = Py_UNICODE_TODECIMAL(ch);
7756 if (decimal >= 0)
7757 *output++ = '0' + decimal;
7758 else if (0 < ch && ch < 256)
7759 *output++ = (char)ch;
7760 else {
7761 Py_DECREF(repunicode);
7762 raise_encode_exception(&exc, encoding,
7763 s, length, collstart-s, collend-s, reason);
7764 goto onError;
7765 }
7766 }
7767 }
7768 p = s + newpos;
7769 Py_DECREF(repunicode);
7770 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007771 }
7772 /* 0-terminate the output string */
7773 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007774 Py_XDECREF(exc);
7775 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007776 return 0;
7777
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007779 Py_XDECREF(exc);
7780 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007781 return -1;
7782}
7783
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784/* --- Helpers ------------------------------------------------------------ */
7785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007786#include "stringlib/ucs1lib.h"
7787#include "stringlib/fastsearch.h"
7788#include "stringlib/partition.h"
7789#include "stringlib/split.h"
7790#include "stringlib/count.h"
7791#include "stringlib/find.h"
7792#include "stringlib/localeutil.h"
7793#include "stringlib/undef.h"
7794
7795#include "stringlib/ucs2lib.h"
7796#include "stringlib/fastsearch.h"
7797#include "stringlib/partition.h"
7798#include "stringlib/split.h"
7799#include "stringlib/count.h"
7800#include "stringlib/find.h"
7801#include "stringlib/localeutil.h"
7802#include "stringlib/undef.h"
7803
7804#include "stringlib/ucs4lib.h"
7805#include "stringlib/fastsearch.h"
7806#include "stringlib/partition.h"
7807#include "stringlib/split.h"
7808#include "stringlib/count.h"
7809#include "stringlib/find.h"
7810#include "stringlib/localeutil.h"
7811#include "stringlib/undef.h"
7812
7813static Py_ssize_t
7814any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7815 const Py_UCS1*, Py_ssize_t,
7816 Py_ssize_t, Py_ssize_t),
7817 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7818 const Py_UCS2*, Py_ssize_t,
7819 Py_ssize_t, Py_ssize_t),
7820 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7821 const Py_UCS4*, Py_ssize_t,
7822 Py_ssize_t, Py_ssize_t),
7823 PyObject* s1, PyObject* s2,
7824 Py_ssize_t start,
7825 Py_ssize_t end)
7826{
7827 int kind1, kind2, kind;
7828 void *buf1, *buf2;
7829 Py_ssize_t len1, len2, result;
7830
7831 kind1 = PyUnicode_KIND(s1);
7832 kind2 = PyUnicode_KIND(s2);
7833 kind = kind1 > kind2 ? kind1 : kind2;
7834 buf1 = PyUnicode_DATA(s1);
7835 buf2 = PyUnicode_DATA(s2);
7836 if (kind1 != kind)
7837 buf1 = _PyUnicode_AsKind(s1, kind);
7838 if (!buf1)
7839 return -2;
7840 if (kind2 != kind)
7841 buf2 = _PyUnicode_AsKind(s2, kind);
7842 if (!buf2) {
7843 if (kind1 != kind) PyMem_Free(buf1);
7844 return -2;
7845 }
7846 len1 = PyUnicode_GET_LENGTH(s1);
7847 len2 = PyUnicode_GET_LENGTH(s2);
7848
7849 switch(kind) {
7850 case PyUnicode_1BYTE_KIND:
7851 result = ucs1(buf1, len1, buf2, len2, start, end);
7852 break;
7853 case PyUnicode_2BYTE_KIND:
7854 result = ucs2(buf1, len1, buf2, len2, start, end);
7855 break;
7856 case PyUnicode_4BYTE_KIND:
7857 result = ucs4(buf1, len1, buf2, len2, start, end);
7858 break;
7859 default:
7860 assert(0); result = -2;
7861 }
7862
7863 if (kind1 != kind)
7864 PyMem_Free(buf1);
7865 if (kind2 != kind)
7866 PyMem_Free(buf2);
7867
7868 return result;
7869}
7870
7871Py_ssize_t
7872_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7873 Py_ssize_t n_buffer,
7874 void *digits, Py_ssize_t n_digits,
7875 Py_ssize_t min_width,
7876 const char *grouping,
7877 const char *thousands_sep)
7878{
7879 switch(kind) {
7880 case PyUnicode_1BYTE_KIND:
7881 return _PyUnicode_ucs1_InsertThousandsGrouping(
7882 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7883 min_width, grouping, thousands_sep);
7884 case PyUnicode_2BYTE_KIND:
7885 return _PyUnicode_ucs2_InsertThousandsGrouping(
7886 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7887 min_width, grouping, thousands_sep);
7888 case PyUnicode_4BYTE_KIND:
7889 return _PyUnicode_ucs4_InsertThousandsGrouping(
7890 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7891 min_width, grouping, thousands_sep);
7892 }
7893 assert(0);
7894 return -1;
7895}
7896
7897
Eric Smith8c663262007-08-25 02:26:07 +00007898#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007899#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007900
Thomas Wouters477c8d52006-05-27 19:21:47 +00007901#include "stringlib/count.h"
7902#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007903
Thomas Wouters477c8d52006-05-27 19:21:47 +00007904/* Helper macro to fix up start/end slice values against a length `len`.

   - `end` greater than `len` is lowered to `len`; a negative `end` has
     `len` added to it and is then raised to 0 if it is still negative.
   - a negative `start` has `len` added to it and is then raised to 0 if it
     is still negative.  `start` is never lowered, so it may remain larger
     than `len` (or than `end`) after the adjustment.

   `start` and `end` are assigned to, so they must be modifiable lvalues;
   all three arguments are expanded more than once and unparenthesized.
   The expansion is a bare pair of `if` statements (no do/while wrapper),
   so use it only as a complete statement inside braces.

   NOTE(review): the stringlib headers included above may define a macro of
   the same name; keep this replacement list token-identical unless that has
   been ruled out -- confirm before changing the expansion. */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007905#define ADJUST_INDICES(start, end, len) \
7906 if (end > len) \
7907 end = len; \
7908 else if (end < 0) { \
7909 end += len; \
7910 if (end < 0) \
7911 end = 0; \
7912 } \
7913 if (start < 0) { \
7914 start += len; \
7915 if (start < 0) \
7916 start = 0; \
7917 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007918
Alexander Belopolsky40018472011-02-26 01:02:56 +00007919Py_ssize_t
7920PyUnicode_Count(PyObject *str,
7921 PyObject *substr,
7922 Py_ssize_t start,
7923 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007925 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007926 PyUnicodeObject* str_obj;
7927 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007928 int kind1, kind2, kind;
7929 void *buf1 = NULL, *buf2 = NULL;
7930 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007931
Thomas Wouters477c8d52006-05-27 19:21:47 +00007932 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007933 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007935 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007936 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 Py_DECREF(str_obj);
7938 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939 }
Tim Petersced69f82003-09-16 20:30:58 +00007940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007941 kind1 = PyUnicode_KIND(str_obj);
7942 kind2 = PyUnicode_KIND(sub_obj);
7943 kind = kind1 > kind2 ? kind1 : kind2;
7944 buf1 = PyUnicode_DATA(str_obj);
7945 if (kind1 != kind)
7946 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7947 if (!buf1)
7948 goto onError;
7949 buf2 = PyUnicode_DATA(sub_obj);
7950 if (kind2 != kind)
7951 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7952 if (!buf2)
7953 goto onError;
7954 len1 = PyUnicode_GET_LENGTH(str_obj);
7955 len2 = PyUnicode_GET_LENGTH(sub_obj);
7956
7957 ADJUST_INDICES(start, end, len1);
7958 switch(kind) {
7959 case PyUnicode_1BYTE_KIND:
7960 result = ucs1lib_count(
7961 ((Py_UCS1*)buf1) + start, end - start,
7962 buf2, len2, PY_SSIZE_T_MAX
7963 );
7964 break;
7965 case PyUnicode_2BYTE_KIND:
7966 result = ucs2lib_count(
7967 ((Py_UCS2*)buf1) + start, end - start,
7968 buf2, len2, PY_SSIZE_T_MAX
7969 );
7970 break;
7971 case PyUnicode_4BYTE_KIND:
7972 result = ucs4lib_count(
7973 ((Py_UCS4*)buf1) + start, end - start,
7974 buf2, len2, PY_SSIZE_T_MAX
7975 );
7976 break;
7977 default:
7978 assert(0); result = 0;
7979 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007980
7981 Py_DECREF(sub_obj);
7982 Py_DECREF(str_obj);
7983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007984 if (kind1 != kind)
7985 PyMem_Free(buf1);
7986 if (kind2 != kind)
7987 PyMem_Free(buf2);
7988
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007990 onError:
7991 Py_DECREF(sub_obj);
7992 Py_DECREF(str_obj);
7993 if (kind1 != kind && buf1)
7994 PyMem_Free(buf1);
7995 if (kind2 != kind && buf2)
7996 PyMem_Free(buf2);
7997 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998}
7999
Alexander Belopolsky40018472011-02-26 01:02:56 +00008000Py_ssize_t
8001PyUnicode_Find(PyObject *str,
8002 PyObject *sub,
8003 Py_ssize_t start,
8004 Py_ssize_t end,
8005 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008007 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008008
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008010 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008012 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008013 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 Py_DECREF(str);
8015 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 }
Tim Petersced69f82003-09-16 20:30:58 +00008017
Thomas Wouters477c8d52006-05-27 19:21:47 +00008018 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008019 result = any_find_slice(
8020 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8021 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008022 );
8023 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008024 result = any_find_slice(
8025 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8026 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008027 );
8028
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008030 Py_DECREF(sub);
8031
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 return result;
8033}
8034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008035Py_ssize_t
8036PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8037 Py_ssize_t start, Py_ssize_t end,
8038 int direction)
8039{
8040 char *result;
8041 int kind;
8042 if (PyUnicode_READY(str) == -1)
8043 return -2;
8044 if (end > PyUnicode_GET_LENGTH(str))
8045 end = PyUnicode_GET_LENGTH(str);
8046 kind = PyUnicode_KIND(str);
8047 result = findchar(PyUnicode_1BYTE_DATA(str)
8048 + PyUnicode_KIND_SIZE(kind, start),
8049 kind,
8050 end-start, ch, direction);
8051 if (!result)
8052 return -1;
8053 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8054}
8055
Alexander Belopolsky40018472011-02-26 01:02:56 +00008056static int
8057tailmatch(PyUnicodeObject *self,
8058 PyUnicodeObject *substring,
8059 Py_ssize_t start,
8060 Py_ssize_t end,
8061 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008063 int kind_self;
8064 int kind_sub;
8065 void *data_self;
8066 void *data_sub;
8067 Py_ssize_t offset;
8068 Py_ssize_t i;
8069 Py_ssize_t end_sub;
8070
8071 if (PyUnicode_READY(self) == -1 ||
8072 PyUnicode_READY(substring) == -1)
8073 return 0;
8074
8075 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 return 1;
8077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008078 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8079 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008083 kind_self = PyUnicode_KIND(self);
8084 data_self = PyUnicode_DATA(self);
8085 kind_sub = PyUnicode_KIND(substring);
8086 data_sub = PyUnicode_DATA(substring);
8087 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8088
8089 if (direction > 0)
8090 offset = end;
8091 else
8092 offset = start;
8093
8094 if (PyUnicode_READ(kind_self, data_self, offset) ==
8095 PyUnicode_READ(kind_sub, data_sub, 0) &&
8096 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8097 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8098 /* If both are of the same kind, memcmp is sufficient */
8099 if (kind_self == kind_sub) {
8100 return ! memcmp((char *)data_self +
8101 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8102 data_sub,
8103 PyUnicode_GET_LENGTH(substring) *
8104 PyUnicode_CHARACTER_SIZE(substring));
8105 }
8106 /* otherwise we have to compare each character by first accesing it */
8107 else {
8108 /* We do not need to compare 0 and len(substring)-1 because
8109 the if statement above ensured already that they are equal
8110 when we end up here. */
8111 // TODO: honor direction and do a forward or backwards search
8112 for (i = 1; i < end_sub; ++i) {
8113 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8114 PyUnicode_READ(kind_sub, data_sub, i))
8115 return 0;
8116 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008118 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 }
8120
8121 return 0;
8122}
8123
Alexander Belopolsky40018472011-02-26 01:02:56 +00008124Py_ssize_t
8125PyUnicode_Tailmatch(PyObject *str,
8126 PyObject *substr,
8127 Py_ssize_t start,
8128 Py_ssize_t end,
8129 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008131 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008132
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133 str = PyUnicode_FromObject(str);
8134 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136 substr = PyUnicode_FromObject(substr);
8137 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 Py_DECREF(str);
8139 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 }
Tim Petersced69f82003-09-16 20:30:58 +00008141
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 (PyUnicodeObject *)substr,
8144 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 Py_DECREF(str);
8146 Py_DECREF(substr);
8147 return result;
8148}
8149
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150/* Apply fixfct filter to the Unicode object self and return a
8151 reference to the modified object */
8152
Alexander Belopolsky40018472011-02-26 01:02:56 +00008153static PyObject *
8154fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008155 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008157 PyObject *u;
8158 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008160 if (PyUnicode_READY(self) == -1)
8161 return NULL;
8162 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8163 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8164 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008168 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8169 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008171 /* fix functions return the new maximum character in a string,
8172 if the kind of the resulting unicode object does not change,
8173 everything is fine. Otherwise we need to change the string kind
8174 and re-run the fix function. */
8175 maxchar_new = fixfct((PyUnicodeObject*)u);
8176 if (maxchar_new == 0)
8177 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8178 else if (maxchar_new <= 127)
8179 maxchar_new = 127;
8180 else if (maxchar_new <= 255)
8181 maxchar_new = 255;
8182 else if (maxchar_new <= 65535)
8183 maxchar_new = 65535;
8184 else
8185 maxchar_new = 1114111; /* 0x10ffff */
8186
8187 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 /* fixfct should return TRUE if it modified the buffer. If
8189 FALSE, return a reference to the original buffer instead
8190 (to save space, not time) */
8191 Py_INCREF(self);
8192 Py_DECREF(u);
8193 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008195 else if (maxchar_new == maxchar_old) {
8196 return u;
8197 }
8198 else {
8199 /* In case the maximum character changed, we need to
8200 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008201 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008202 if (v == NULL) {
8203 Py_DECREF(u);
8204 return NULL;
8205 }
8206 if (maxchar_new > maxchar_old) {
8207 /* If the maxchar increased so that the kind changed, not all
8208 characters are representable anymore and we need to fix the
8209 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008210 if (PyUnicode_CopyCharacters(v, 0,
8211 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008212 PyUnicode_GET_LENGTH(self)) < 0)
8213 {
8214 Py_DECREF(u);
8215 return NULL;
8216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008217 maxchar_old = fixfct((PyUnicodeObject*)v);
8218 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8219 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008220 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008221 if (PyUnicode_CopyCharacters(v, 0,
8222 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008223 PyUnicode_GET_LENGTH(self)) < 0)
8224 {
8225 Py_DECREF(u);
8226 return NULL;
8227 }
8228 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008229
8230 Py_DECREF(u);
8231 return v;
8232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233}
8234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008236fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238 /* No need to call PyUnicode_READY(self) because this function is only
8239 called as a callback from fixup() which does it already. */
8240 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8241 const int kind = PyUnicode_KIND(self);
8242 void *data = PyUnicode_DATA(self);
8243 int touched = 0;
8244 Py_UCS4 maxchar = 0;
8245 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008247 for (i = 0; i < len; ++i) {
8248 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8249 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8250 if (up != ch) {
8251 if (up > maxchar)
8252 maxchar = up;
8253 PyUnicode_WRITE(kind, data, i, up);
8254 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008256 else if (ch > maxchar)
8257 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258 }
8259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008260 if (touched)
8261 return maxchar;
8262 else
8263 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264}
8265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008266static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008267fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008269 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8270 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8271 const int kind = PyUnicode_KIND(self);
8272 void *data = PyUnicode_DATA(self);
8273 int touched = 0;
8274 Py_UCS4 maxchar = 0;
8275 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008277 for(i = 0; i < len; ++i) {
8278 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8279 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8280 if (lo != ch) {
8281 if (lo > maxchar)
8282 maxchar = lo;
8283 PyUnicode_WRITE(kind, data, i, lo);
8284 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008286 else if (ch > maxchar)
8287 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288 }
8289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008290 if (touched)
8291 return maxchar;
8292 else
8293 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294}
8295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008296static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008297fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008299 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8300 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8301 const int kind = PyUnicode_KIND(self);
8302 void *data = PyUnicode_DATA(self);
8303 int touched = 0;
8304 Py_UCS4 maxchar = 0;
8305 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008307 for(i = 0; i < len; ++i) {
8308 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8309 Py_UCS4 nu = 0;
8310
8311 if (Py_UNICODE_ISUPPER(ch))
8312 nu = Py_UNICODE_TOLOWER(ch);
8313 else if (Py_UNICODE_ISLOWER(ch))
8314 nu = Py_UNICODE_TOUPPER(ch);
8315
8316 if (nu != 0) {
8317 if (nu > maxchar)
8318 maxchar = nu;
8319 PyUnicode_WRITE(kind, data, i, nu);
8320 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008322 else if (ch > maxchar)
8323 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 }
8325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008326 if (touched)
8327 return maxchar;
8328 else
8329 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330}
8331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8336 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8337 const int kind = PyUnicode_KIND(self);
8338 void *data = PyUnicode_DATA(self);
8339 int touched = 0;
8340 Py_UCS4 maxchar = 0;
8341 Py_ssize_t i = 0;
8342 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008343
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008344 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008346
8347 ch = PyUnicode_READ(kind, data, i);
8348 if (!Py_UNICODE_ISUPPER(ch)) {
8349 maxchar = Py_UNICODE_TOUPPER(ch);
8350 PyUnicode_WRITE(kind, data, i, maxchar);
8351 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 ++i;
8354 for(; i < len; ++i) {
8355 ch = PyUnicode_READ(kind, data, i);
8356 if (!Py_UNICODE_ISLOWER(ch)) {
8357 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8358 if (lo > maxchar)
8359 maxchar = lo;
8360 PyUnicode_WRITE(kind, data, i, lo);
8361 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008363 else if (ch > maxchar)
8364 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366
8367 if (touched)
8368 return maxchar;
8369 else
8370 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371}
8372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008374fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8377 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8378 const int kind = PyUnicode_KIND(self);
8379 void *data = PyUnicode_DATA(self);
8380 Py_UCS4 maxchar = 0;
8381 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 int previous_is_cased;
8383
8384 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 if (len == 1) {
8386 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8387 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8388 if (ti != ch) {
8389 PyUnicode_WRITE(kind, data, i, ti);
8390 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 }
8392 else
8393 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396 for(; i < len; ++i) {
8397 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8398 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008399
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008403 nu = Py_UNICODE_TOTITLE(ch);
8404
8405 if (nu > maxchar)
8406 maxchar = nu;
8407 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008408
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 if (Py_UNICODE_ISLOWER(ch) ||
8410 Py_UNICODE_ISUPPER(ch) ||
8411 Py_UNICODE_ISTITLE(ch))
8412 previous_is_cased = 1;
8413 else
8414 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008416 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417}
8418
Tim Peters8ce9f162004-08-27 01:49:32 +00008419PyObject *
8420PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008423 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008425 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008426 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8427 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008428 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008429 Py_ssize_t sz, i, res_offset;
8430 Py_UCS4 maxchar = 0;
8431 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432
Tim Peters05eba1f2004-08-27 21:32:02 +00008433 fseq = PySequence_Fast(seq, "");
8434 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008435 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008436 }
8437
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008438 /* NOTE: the following code can't call back into Python code,
8439 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008440 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008441
Tim Peters05eba1f2004-08-27 21:32:02 +00008442 seqlen = PySequence_Fast_GET_SIZE(fseq);
8443 /* If empty sequence, return u"". */
8444 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008445 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008446 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008447 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008448 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008449 /* If singleton sequence with an exact Unicode, return that. */
8450 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 item = items[0];
8452 if (PyUnicode_CheckExact(item)) {
8453 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 goto Done;
8456 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008457 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008458 else {
8459 /* Set up sep and seplen */
8460 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461 /* fall back to a blank space separator */
8462 sep = PyUnicode_FromOrdinal(' ');
8463 if (!sep || PyUnicode_READY(sep) == -1)
8464 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008465 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008466 else {
8467 if (!PyUnicode_Check(separator)) {
8468 PyErr_Format(PyExc_TypeError,
8469 "separator: expected str instance,"
8470 " %.80s found",
8471 Py_TYPE(separator)->tp_name);
8472 goto onError;
8473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474 if (PyUnicode_READY(separator) == -1)
8475 goto onError;
8476 sep = separator;
8477 seplen = PyUnicode_GET_LENGTH(separator);
8478 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8479 /* inc refcount to keep this code path symetric with the
8480 above case of a blank separator */
8481 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008482 }
8483 }
8484
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008485 /* There are at least two things to join, or else we have a subclass
8486 * of str in the sequence.
8487 * Do a pre-pass to figure out the total amount of space we'll
8488 * need (sz), and see whether all argument are strings.
8489 */
8490 sz = 0;
8491 for (i = 0; i < seqlen; i++) {
8492 const Py_ssize_t old_sz = sz;
8493 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 if (!PyUnicode_Check(item)) {
8495 PyErr_Format(PyExc_TypeError,
8496 "sequence item %zd: expected str instance,"
8497 " %.80s found",
8498 i, Py_TYPE(item)->tp_name);
8499 goto onError;
8500 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 if (PyUnicode_READY(item) == -1)
8502 goto onError;
8503 sz += PyUnicode_GET_LENGTH(item);
8504 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8505 if (item_maxchar > maxchar)
8506 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008507 if (i != 0)
8508 sz += seplen;
8509 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8510 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008512 goto onError;
8513 }
8514 }
Tim Petersced69f82003-09-16 20:30:58 +00008515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008517 if (res == NULL)
8518 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008519
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008520 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008522 Py_ssize_t itemlen;
8523 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008524 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 /* Copy item, and maybe the separator. */
8526 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008527 if (PyUnicode_CopyCharacters(res, res_offset,
8528 sep, 0, seplen) < 0)
8529 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008532 if (PyUnicode_CopyCharacters(res, res_offset,
8533 item, 0, itemlen) < 0)
8534 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008538
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008540 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 Py_XDECREF(sep);
8542 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008545 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008547 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548 return NULL;
8549}
8550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551#define FILL(kind, data, value, start, length) \
8552 do { \
8553 Py_ssize_t i_ = 0; \
8554 assert(kind != PyUnicode_WCHAR_KIND); \
8555 switch ((kind)) { \
8556 case PyUnicode_1BYTE_KIND: { \
8557 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8558 memset(to_, (unsigned char)value, length); \
8559 break; \
8560 } \
8561 case PyUnicode_2BYTE_KIND: { \
8562 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8563 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8564 break; \
8565 } \
8566 default: { \
8567 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8568 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8569 break; \
8570 } \
8571 } \
8572 } while (0)
8573
Alexander Belopolsky40018472011-02-26 01:02:56 +00008574static PyUnicodeObject *
8575pad(PyUnicodeObject *self,
8576 Py_ssize_t left,
8577 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580 PyObject *u;
8581 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008582 int kind;
8583 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584
8585 if (left < 0)
8586 left = 0;
8587 if (right < 0)
8588 right = 0;
8589
Tim Peters7a29bd52001-09-12 03:03:31 +00008590 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 Py_INCREF(self);
8592 return self;
8593 }
8594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8596 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008597 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8598 return NULL;
8599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8601 if (fill > maxchar)
8602 maxchar = fill;
8603 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008604 if (!u)
8605 return NULL;
8606
8607 kind = PyUnicode_KIND(u);
8608 data = PyUnicode_DATA(u);
8609 if (left)
8610 FILL(kind, data, fill, 0, left);
8611 if (right)
8612 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008613 if (PyUnicode_CopyCharacters(u, left,
8614 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008615 _PyUnicode_LENGTH(self)) < 0)
8616 {
8617 Py_DECREF(u);
8618 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619 }
8620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624
Alexander Belopolsky40018472011-02-26 01:02:56 +00008625PyObject *
8626PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629
8630 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 switch(PyUnicode_KIND(string)) {
8635 case PyUnicode_1BYTE_KIND:
8636 list = ucs1lib_splitlines(
8637 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8638 PyUnicode_GET_LENGTH(string), keepends);
8639 break;
8640 case PyUnicode_2BYTE_KIND:
8641 list = ucs2lib_splitlines(
8642 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8643 PyUnicode_GET_LENGTH(string), keepends);
8644 break;
8645 case PyUnicode_4BYTE_KIND:
8646 list = ucs4lib_splitlines(
8647 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8648 PyUnicode_GET_LENGTH(string), keepends);
8649 break;
8650 default:
8651 assert(0);
8652 list = 0;
8653 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654 Py_DECREF(string);
8655 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656}
8657
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658static PyObject *
8659split(PyUnicodeObject *self,
8660 PyUnicodeObject *substring,
8661 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 int kind1, kind2, kind;
8664 void *buf1, *buf2;
8665 Py_ssize_t len1, len2;
8666 PyObject* out;
8667
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008669 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 if (PyUnicode_READY(self) == -1)
8672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 if (substring == NULL)
8675 switch(PyUnicode_KIND(self)) {
8676 case PyUnicode_1BYTE_KIND:
8677 return ucs1lib_split_whitespace(
8678 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8679 PyUnicode_GET_LENGTH(self), maxcount
8680 );
8681 case PyUnicode_2BYTE_KIND:
8682 return ucs2lib_split_whitespace(
8683 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8684 PyUnicode_GET_LENGTH(self), maxcount
8685 );
8686 case PyUnicode_4BYTE_KIND:
8687 return ucs4lib_split_whitespace(
8688 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8689 PyUnicode_GET_LENGTH(self), maxcount
8690 );
8691 default:
8692 assert(0);
8693 return NULL;
8694 }
8695
8696 if (PyUnicode_READY(substring) == -1)
8697 return NULL;
8698
8699 kind1 = PyUnicode_KIND(self);
8700 kind2 = PyUnicode_KIND(substring);
8701 kind = kind1 > kind2 ? kind1 : kind2;
8702 buf1 = PyUnicode_DATA(self);
8703 buf2 = PyUnicode_DATA(substring);
8704 if (kind1 != kind)
8705 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8706 if (!buf1)
8707 return NULL;
8708 if (kind2 != kind)
8709 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8710 if (!buf2) {
8711 if (kind1 != kind) PyMem_Free(buf1);
8712 return NULL;
8713 }
8714 len1 = PyUnicode_GET_LENGTH(self);
8715 len2 = PyUnicode_GET_LENGTH(substring);
8716
8717 switch(kind) {
8718 case PyUnicode_1BYTE_KIND:
8719 out = ucs1lib_split(
8720 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8721 break;
8722 case PyUnicode_2BYTE_KIND:
8723 out = ucs2lib_split(
8724 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8725 break;
8726 case PyUnicode_4BYTE_KIND:
8727 out = ucs4lib_split(
8728 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8729 break;
8730 default:
8731 out = NULL;
8732 }
8733 if (kind1 != kind)
8734 PyMem_Free(buf1);
8735 if (kind2 != kind)
8736 PyMem_Free(buf2);
8737 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738}
8739
Alexander Belopolsky40018472011-02-26 01:02:56 +00008740static PyObject *
8741rsplit(PyUnicodeObject *self,
8742 PyUnicodeObject *substring,
8743 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008745 int kind1, kind2, kind;
8746 void *buf1, *buf2;
8747 Py_ssize_t len1, len2;
8748 PyObject* out;
8749
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008750 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008751 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 if (PyUnicode_READY(self) == -1)
8754 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756 if (substring == NULL)
8757 switch(PyUnicode_KIND(self)) {
8758 case PyUnicode_1BYTE_KIND:
8759 return ucs1lib_rsplit_whitespace(
8760 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8761 PyUnicode_GET_LENGTH(self), maxcount
8762 );
8763 case PyUnicode_2BYTE_KIND:
8764 return ucs2lib_rsplit_whitespace(
8765 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8766 PyUnicode_GET_LENGTH(self), maxcount
8767 );
8768 case PyUnicode_4BYTE_KIND:
8769 return ucs4lib_rsplit_whitespace(
8770 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8771 PyUnicode_GET_LENGTH(self), maxcount
8772 );
8773 default:
8774 assert(0);
8775 return NULL;
8776 }
8777
8778 if (PyUnicode_READY(substring) == -1)
8779 return NULL;
8780
8781 kind1 = PyUnicode_KIND(self);
8782 kind2 = PyUnicode_KIND(substring);
8783 kind = kind1 > kind2 ? kind1 : kind2;
8784 buf1 = PyUnicode_DATA(self);
8785 buf2 = PyUnicode_DATA(substring);
8786 if (kind1 != kind)
8787 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8788 if (!buf1)
8789 return NULL;
8790 if (kind2 != kind)
8791 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8792 if (!buf2) {
8793 if (kind1 != kind) PyMem_Free(buf1);
8794 return NULL;
8795 }
8796 len1 = PyUnicode_GET_LENGTH(self);
8797 len2 = PyUnicode_GET_LENGTH(substring);
8798
8799 switch(kind) {
8800 case PyUnicode_1BYTE_KIND:
8801 out = ucs1lib_rsplit(
8802 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8803 break;
8804 case PyUnicode_2BYTE_KIND:
8805 out = ucs2lib_rsplit(
8806 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8807 break;
8808 case PyUnicode_4BYTE_KIND:
8809 out = ucs4lib_rsplit(
8810 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8811 break;
8812 default:
8813 out = NULL;
8814 }
8815 if (kind1 != kind)
8816 PyMem_Free(buf1);
8817 if (kind2 != kind)
8818 PyMem_Free(buf2);
8819 return out;
8820}
8821
8822static Py_ssize_t
8823anylib_find(int kind, void *buf1, Py_ssize_t len1,
8824 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8825{
8826 switch(kind) {
8827 case PyUnicode_1BYTE_KIND:
8828 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8829 case PyUnicode_2BYTE_KIND:
8830 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8831 case PyUnicode_4BYTE_KIND:
8832 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8833 }
8834 assert(0);
8835 return -1;
8836}
8837
8838static Py_ssize_t
8839anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8840 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8841{
8842 switch(kind) {
8843 case PyUnicode_1BYTE_KIND:
8844 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8845 case PyUnicode_2BYTE_KIND:
8846 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8847 case PyUnicode_4BYTE_KIND:
8848 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8849 }
8850 assert(0);
8851 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008852}
8853
Alexander Belopolsky40018472011-02-26 01:02:56 +00008854static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855replace(PyObject *self, PyObject *str1,
8856 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 PyObject *u;
8859 char *sbuf = PyUnicode_DATA(self);
8860 char *buf1 = PyUnicode_DATA(str1);
8861 char *buf2 = PyUnicode_DATA(str2);
8862 int srelease = 0, release1 = 0, release2 = 0;
8863 int skind = PyUnicode_KIND(self);
8864 int kind1 = PyUnicode_KIND(str1);
8865 int kind2 = PyUnicode_KIND(str2);
8866 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8867 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8868 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869
8870 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008871 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008873 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 if (skind < kind1)
8876 /* substring too wide to be present */
8877 goto nothing;
8878
8879 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008880 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008881 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008883 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008885 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 Py_UCS4 u1, u2, maxchar;
8887 int mayshrink, rkind;
8888 u1 = PyUnicode_READ_CHAR(str1, 0);
8889 if (!findchar(sbuf, PyUnicode_KIND(self),
8890 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008891 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008892 u2 = PyUnicode_READ_CHAR(str2, 0);
8893 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8894 /* Replacing u1 with u2 may cause a maxchar reduction in the
8895 result string. */
8896 mayshrink = maxchar > 127;
8897 if (u2 > maxchar) {
8898 maxchar = u2;
8899 mayshrink = 0;
8900 }
8901 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008902 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008904 if (PyUnicode_CopyCharacters(u, 0,
8905 (PyObject*)self, 0, slen) < 0)
8906 {
8907 Py_DECREF(u);
8908 return NULL;
8909 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910 rkind = PyUnicode_KIND(u);
8911 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8912 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008913 if (--maxcount < 0)
8914 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917 if (mayshrink) {
8918 PyObject *tmp = u;
8919 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8920 PyUnicode_GET_LENGTH(tmp));
8921 Py_DECREF(tmp);
8922 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 int rkind = skind;
8925 char *res;
8926 if (kind1 < rkind) {
8927 /* widen substring */
8928 buf1 = _PyUnicode_AsKind(str1, rkind);
8929 if (!buf1) goto error;
8930 release1 = 1;
8931 }
8932 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008933 if (i < 0)
8934 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 if (rkind > kind2) {
8936 /* widen replacement */
8937 buf2 = _PyUnicode_AsKind(str2, rkind);
8938 if (!buf2) goto error;
8939 release2 = 1;
8940 }
8941 else if (rkind < kind2) {
8942 /* widen self and buf1 */
8943 rkind = kind2;
8944 if (release1) PyMem_Free(buf1);
8945 sbuf = _PyUnicode_AsKind(self, rkind);
8946 if (!sbuf) goto error;
8947 srelease = 1;
8948 buf1 = _PyUnicode_AsKind(str1, rkind);
8949 if (!buf1) goto error;
8950 release1 = 1;
8951 }
8952 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8953 if (!res) {
8954 PyErr_NoMemory();
8955 goto error;
8956 }
8957 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008958 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8960 buf2,
8961 PyUnicode_KIND_SIZE(rkind, len2));
8962 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008963
8964 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8966 slen-i,
8967 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008968 if (i == -1)
8969 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8971 buf2,
8972 PyUnicode_KIND_SIZE(rkind, len2));
8973 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008974 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975
8976 u = PyUnicode_FromKindAndData(rkind, res, slen);
8977 PyMem_Free(res);
8978 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 Py_ssize_t n, i, j, ires;
8983 Py_ssize_t product, new_size;
8984 int rkind = skind;
8985 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987 if (kind1 < rkind) {
8988 buf1 = _PyUnicode_AsKind(str1, rkind);
8989 if (!buf1) goto error;
8990 release1 = 1;
8991 }
8992 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008993 if (n == 0)
8994 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995 if (kind2 < rkind) {
8996 buf2 = _PyUnicode_AsKind(str2, rkind);
8997 if (!buf2) goto error;
8998 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 else if (kind2 > rkind) {
9001 rkind = kind2;
9002 sbuf = _PyUnicode_AsKind(self, rkind);
9003 if (!sbuf) goto error;
9004 srelease = 1;
9005 if (release1) PyMem_Free(buf1);
9006 buf1 = _PyUnicode_AsKind(str1, rkind);
9007 if (!buf1) goto error;
9008 release1 = 1;
9009 }
9010 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9011 PyUnicode_GET_LENGTH(str1))); */
9012 product = n * (len2-len1);
9013 if ((product / (len2-len1)) != n) {
9014 PyErr_SetString(PyExc_OverflowError,
9015 "replace string is too long");
9016 goto error;
9017 }
9018 new_size = slen + product;
9019 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9020 PyErr_SetString(PyExc_OverflowError,
9021 "replace string is too long");
9022 goto error;
9023 }
9024 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9025 if (!res)
9026 goto error;
9027 ires = i = 0;
9028 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009029 while (n-- > 0) {
9030 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 j = anylib_find(rkind,
9032 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9033 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009034 if (j == -1)
9035 break;
9036 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009037 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9039 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9040 PyUnicode_KIND_SIZE(rkind, j-i));
9041 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009042 }
9043 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 if (len2 > 0) {
9045 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9046 buf2,
9047 PyUnicode_KIND_SIZE(rkind, len2));
9048 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009053 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9055 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9056 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009057 } else {
9058 /* interleave */
9059 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9061 buf2,
9062 PyUnicode_KIND_SIZE(rkind, len2));
9063 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009064 if (--n <= 0)
9065 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9067 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9068 PyUnicode_KIND_SIZE(rkind, 1));
9069 ires++;
9070 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009071 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9073 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9074 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 if (srelease)
9079 PyMem_FREE(sbuf);
9080 if (release1)
9081 PyMem_FREE(buf1);
9082 if (release2)
9083 PyMem_FREE(buf2);
9084 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009085
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009087 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 if (srelease)
9089 PyMem_FREE(sbuf);
9090 if (release1)
9091 PyMem_FREE(buf1);
9092 if (release2)
9093 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009094 if (PyUnicode_CheckExact(self)) {
9095 Py_INCREF(self);
9096 return (PyObject *) self;
9097 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009098 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 error:
9100 if (srelease && sbuf)
9101 PyMem_FREE(sbuf);
9102 if (release1 && buf1)
9103 PyMem_FREE(buf1);
9104 if (release2 && buf2)
9105 PyMem_FREE(buf2);
9106 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107}
9108
9109/* --- Unicode Object Methods --------------------------------------------- */
9110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009111PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009112 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113\n\
9114Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009115characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116
9117static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009118unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009120 return fixup(self, fixtitle);
9121}
9122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009123PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009124 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125\n\
9126Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009127have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128
9129static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009130unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 return fixup(self, fixcapitalize);
9133}
9134
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords().

   Splits self with split(self, NULL, -1), replaces every list item by the
   result of fixup(item, fixcapitalize), and passes the list to
   PyUnicode_Join(NULL, list).  Returns NULL if splitting or any fixup()
   call fails; the word list is released on every path. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the old word, then store the new one in its slot. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9172
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009173/* Argument converter. Coerces to a single unicode character */
9174
9175static int
9176convert_uc(PyObject *obj, void *addr)
9177{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009179 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009180
Benjamin Peterson14339b62009-01-31 16:36:08 +00009181 uniobj = PyUnicode_FromObject(obj);
9182 if (uniobj == NULL) {
9183 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009184 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009185 return 0;
9186 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009188 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009189 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009190 Py_DECREF(uniobj);
9191 return 0;
9192 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 if (PyUnicode_READY(uniobj)) {
9194 Py_DECREF(uniobj);
9195 return 0;
9196 }
9197 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009198 Py_DECREF(uniobj);
9199 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009200}
9201
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009202PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009203 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009205Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009206done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207
9208static PyObject *
9209unicode_center(PyUnicodeObject *self, PyObject *args)
9210{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009211 Py_ssize_t marg, left;
9212 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213 Py_UCS4 fillchar = ' ';
9214
9215 if (PyUnicode_READY(self) == -1)
9216 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217
Thomas Woutersde017742006-02-16 19:34:37 +00009218 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219 return NULL;
9220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222 Py_INCREF(self);
9223 return (PyObject*) self;
9224 }
9225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227 left = marg / 2 + (marg & width & 1);
9228
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009229 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230}
9231
Marc-André Lemburge5034372000-08-08 08:04:29 +00009232#if 0
9233
9234/* This code should go into some future Unicode collation support
9235 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009236 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009237
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009238/* speedy UTF-16 code point order comparison */
9239/* gleaned from: */
9240/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9241
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009242static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009243{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009244 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009245 0, 0, 0, 0, 0, 0, 0, 0,
9246 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009247 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009248};
9249
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250static int
9251unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9252{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009253 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009254
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255 Py_UNICODE *s1 = str1->str;
9256 Py_UNICODE *s2 = str2->str;
9257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 len1 = str1->_base._base.length;
9259 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009260
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009262 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009263
9264 c1 = *s1++;
9265 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009266
Benjamin Peterson29060642009-01-31 22:14:21 +00009267 if (c1 > (1<<11) * 26)
9268 c1 += utf16Fixup[c1>>11];
9269 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009270 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009271 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009272
9273 if (c1 != c2)
9274 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009275
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009276 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009277 }
9278
9279 return (len1 < len2) ? -1 : (len1 != len2);
9280}
9281
Marc-André Lemburge5034372000-08-08 08:04:29 +00009282#else
9283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009284/* This function assumes that str1 and str2 are readied by the caller. */
9285
Marc-André Lemburge5034372000-08-08 08:04:29 +00009286static int
9287unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9288{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 int kind1, kind2;
9290 void *data1, *data2;
9291 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 kind1 = PyUnicode_KIND(str1);
9294 kind2 = PyUnicode_KIND(str2);
9295 data1 = PyUnicode_DATA(str1);
9296 data2 = PyUnicode_DATA(str2);
9297 len1 = PyUnicode_GET_LENGTH(str1);
9298 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300 for (i = 0; i < len1 && i < len2; ++i) {
9301 Py_UCS4 c1, c2;
9302 c1 = PyUnicode_READ(kind1, data1, i);
9303 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009304
9305 if (c1 != c2)
9306 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009307 }
9308
9309 return (len1 < len2) ? -1 : (len1 != len2);
9310}
9311
9312#endif
9313
Alexander Belopolsky40018472011-02-26 01:02:56 +00009314int
9315PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9318 if (PyUnicode_READY(left) == -1 ||
9319 PyUnicode_READY(right) == -1)
9320 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009321 return unicode_compare((PyUnicodeObject *)left,
9322 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009324 PyErr_Format(PyExc_TypeError,
9325 "Can't compare %.100s and %.100s",
9326 left->ob_type->tp_name,
9327 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009328 return -1;
9329}
9330
Martin v. Löwis5b222132007-06-10 09:51:05 +00009331int
9332PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9333{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334 Py_ssize_t i;
9335 int kind;
9336 void *data;
9337 Py_UCS4 chr;
9338
Martin v. Löwis5b222132007-06-10 09:51:05 +00009339 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009340 if (PyUnicode_READY(uni) == -1)
9341 return -1;
9342 kind = PyUnicode_KIND(uni);
9343 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009344 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9346 if (chr != str[i])
9347 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009348 /* This check keeps Python strings that end in '\0' from comparing equal
9349 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009352 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009353 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009354 return 0;
9355}
9356
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009357
Benjamin Peterson29060642009-01-31 22:14:21 +00009358#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009359 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009360
Alexander Belopolsky40018472011-02-26 01:02:56 +00009361PyObject *
9362PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009363{
9364 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009365
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009366 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9367 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009368 if (PyUnicode_READY(left) == -1 ||
9369 PyUnicode_READY(right) == -1)
9370 return NULL;
9371 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9372 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009373 if (op == Py_EQ) {
9374 Py_INCREF(Py_False);
9375 return Py_False;
9376 }
9377 if (op == Py_NE) {
9378 Py_INCREF(Py_True);
9379 return Py_True;
9380 }
9381 }
9382 if (left == right)
9383 result = 0;
9384 else
9385 result = unicode_compare((PyUnicodeObject *)left,
9386 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009387
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009388 /* Convert the return value to a Boolean */
9389 switch (op) {
9390 case Py_EQ:
9391 v = TEST_COND(result == 0);
9392 break;
9393 case Py_NE:
9394 v = TEST_COND(result != 0);
9395 break;
9396 case Py_LE:
9397 v = TEST_COND(result <= 0);
9398 break;
9399 case Py_GE:
9400 v = TEST_COND(result >= 0);
9401 break;
9402 case Py_LT:
9403 v = TEST_COND(result == -1);
9404 break;
9405 case Py_GT:
9406 v = TEST_COND(result == 1);
9407 break;
9408 default:
9409 PyErr_BadArgument();
9410 return NULL;
9411 }
9412 Py_INCREF(v);
9413 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009414 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009415
Brian Curtindfc80e32011-08-10 20:28:54 -05009416 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009417}
9418
Alexander Belopolsky40018472011-02-26 01:02:56 +00009419int
9420PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009421{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009422 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 int kind1, kind2, kind;
9424 void *buf1, *buf2;
9425 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009426 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009427
9428 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009429 sub = PyUnicode_FromObject(element);
9430 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009431 PyErr_Format(PyExc_TypeError,
9432 "'in <string>' requires string as left operand, not %s",
9433 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009434 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 if (PyUnicode_READY(sub) == -1)
9437 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009438
Thomas Wouters477c8d52006-05-27 19:21:47 +00009439 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009441 Py_DECREF(sub);
9442 return -1;
9443 }
9444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 kind1 = PyUnicode_KIND(str);
9446 kind2 = PyUnicode_KIND(sub);
9447 kind = kind1 > kind2 ? kind1 : kind2;
9448 buf1 = PyUnicode_DATA(str);
9449 buf2 = PyUnicode_DATA(sub);
9450 if (kind1 != kind)
9451 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9452 if (!buf1) {
9453 Py_DECREF(sub);
9454 return -1;
9455 }
9456 if (kind2 != kind)
9457 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9458 if (!buf2) {
9459 Py_DECREF(sub);
9460 if (kind1 != kind) PyMem_Free(buf1);
9461 return -1;
9462 }
9463 len1 = PyUnicode_GET_LENGTH(str);
9464 len2 = PyUnicode_GET_LENGTH(sub);
9465
9466 switch(kind) {
9467 case PyUnicode_1BYTE_KIND:
9468 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9469 break;
9470 case PyUnicode_2BYTE_KIND:
9471 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9472 break;
9473 case PyUnicode_4BYTE_KIND:
9474 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9475 break;
9476 default:
9477 result = -1;
9478 assert(0);
9479 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009480
9481 Py_DECREF(str);
9482 Py_DECREF(sub);
9483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 if (kind1 != kind)
9485 PyMem_Free(buf1);
9486 if (kind2 != kind)
9487 PyMem_Free(buf2);
9488
Guido van Rossum403d68b2000-03-13 15:55:09 +00009489 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009490}
9491
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492/* Concat to string or Unicode object giving a new Unicode object. */
9493
Alexander Belopolsky40018472011-02-26 01:02:56 +00009494PyObject *
9495PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 PyObject *u = NULL, *v = NULL, *w;
9498 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499
9500 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009506 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507
9508 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009514 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009515 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 }
9517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9519 goto onError;
9520
9521 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009522 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009525 w = PyUnicode_New(
9526 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9527 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009529 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009530 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9531 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009532 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009533 v, 0,
9534 PyUnicode_GET_LENGTH(v)) < 0)
9535 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536 Py_DECREF(u);
9537 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539
Benjamin Peterson29060642009-01-31 22:14:21 +00009540 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541 Py_XDECREF(u);
9542 Py_XDECREF(v);
9543 return NULL;
9544}
9545
Walter Dörwald1ab83302007-05-18 17:15:44 +00009546void
9547PyUnicode_Append(PyObject **pleft, PyObject *right)
9548{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009549 PyObject *new;
9550 if (*pleft == NULL)
9551 return;
9552 if (right == NULL || !PyUnicode_Check(*pleft)) {
9553 Py_DECREF(*pleft);
9554 *pleft = NULL;
9555 return;
9556 }
9557 new = PyUnicode_Concat(*pleft, right);
9558 Py_DECREF(*pleft);
9559 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009560}
9561
9562void
9563PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9564{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009565 PyUnicode_Append(pleft, right);
9566 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009567}
9568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009569PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009572Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009573string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009574interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575
9576static PyObject *
9577unicode_count(PyUnicodeObject *self, PyObject *args)
9578{
9579 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009580 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009581 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009582 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 int kind1, kind2, kind;
9584 void *buf1, *buf2;
9585 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586
Jesus Ceaac451502011-04-20 17:09:23 +02009587 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9588 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009589 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 kind1 = PyUnicode_KIND(self);
9592 kind2 = PyUnicode_KIND(substring);
9593 kind = kind1 > kind2 ? kind1 : kind2;
9594 buf1 = PyUnicode_DATA(self);
9595 buf2 = PyUnicode_DATA(substring);
9596 if (kind1 != kind)
9597 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9598 if (!buf1) {
9599 Py_DECREF(substring);
9600 return NULL;
9601 }
9602 if (kind2 != kind)
9603 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9604 if (!buf2) {
9605 Py_DECREF(substring);
9606 if (kind1 != kind) PyMem_Free(buf1);
9607 return NULL;
9608 }
9609 len1 = PyUnicode_GET_LENGTH(self);
9610 len2 = PyUnicode_GET_LENGTH(substring);
9611
9612 ADJUST_INDICES(start, end, len1);
9613 switch(kind) {
9614 case PyUnicode_1BYTE_KIND:
9615 iresult = ucs1lib_count(
9616 ((Py_UCS1*)buf1) + start, end - start,
9617 buf2, len2, PY_SSIZE_T_MAX
9618 );
9619 break;
9620 case PyUnicode_2BYTE_KIND:
9621 iresult = ucs2lib_count(
9622 ((Py_UCS2*)buf1) + start, end - start,
9623 buf2, len2, PY_SSIZE_T_MAX
9624 );
9625 break;
9626 case PyUnicode_4BYTE_KIND:
9627 iresult = ucs4lib_count(
9628 ((Py_UCS4*)buf1) + start, end - start,
9629 buf2, len2, PY_SSIZE_T_MAX
9630 );
9631 break;
9632 default:
9633 assert(0); iresult = 0;
9634 }
9635
9636 result = PyLong_FromSsize_t(iresult);
9637
9638 if (kind1 != kind)
9639 PyMem_Free(buf1);
9640 if (kind2 != kind)
9641 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642
9643 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009644
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645 return result;
9646}
9647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009648PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009649 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009651Encode S using the codec registered for encoding. Default encoding\n\
9652is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009653handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009654a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9655'xmlcharrefreplace' as well as any other name registered with\n\
9656codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657
9658static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009659unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009661 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662 char *encoding = NULL;
9663 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009664
Benjamin Peterson308d6372009-09-18 21:42:35 +00009665 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9666 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009668 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009669}
9670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009671PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009672 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673\n\
9674Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009675If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676
9677static PyObject*
9678unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9679{
9680 Py_UNICODE *e;
9681 Py_UNICODE *p;
9682 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009683 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685 PyUnicodeObject *u;
9686 int tabsize = 8;
9687
9688 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009689 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009690
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9692 return NULL;
9693
Thomas Wouters7e474022000-07-16 12:04:32 +00009694 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009695 i = 0; /* chars up to and including most recent \n or \r */
9696 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9698 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009700 if (tabsize > 0) {
9701 incr = tabsize - (j % tabsize); /* cannot overflow */
9702 if (j > PY_SSIZE_T_MAX - incr)
9703 goto overflow1;
9704 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009705 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009707 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009708 if (j > PY_SSIZE_T_MAX - 1)
9709 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710 j++;
9711 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009712 if (i > PY_SSIZE_T_MAX - j)
9713 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009715 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716 }
9717 }
9718
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009719 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009720 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009721
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722 /* Second pass: create output string and fill it */
9723 u = _PyUnicode_New(i + j);
9724 if (!u)
9725 return NULL;
9726
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009727 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 q = _PyUnicode_WSTR(u); /* next output char */
9729 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009732 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009733 if (tabsize > 0) {
9734 i = tabsize - (j % tabsize);
9735 j += i;
9736 while (i--) {
9737 if (q >= qe)
9738 goto overflow2;
9739 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009740 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009741 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009742 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009743 else {
9744 if (q >= qe)
9745 goto overflow2;
9746 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009747 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748 if (*p == '\n' || *p == '\r')
9749 j = 0;
9750 }
9751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 if (PyUnicode_READY(u) == -1) {
9753 Py_DECREF(u);
9754 return NULL;
9755 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009757
9758 overflow2:
9759 Py_DECREF(u);
9760 overflow1:
9761 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9762 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009763}
9764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009765PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009766 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009767\n\
9768Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009769such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770arguments start and end are interpreted as in slice notation.\n\
9771\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009772Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773
9774static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776{
Jesus Ceaac451502011-04-20 17:09:23 +02009777 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009778 Py_ssize_t start;
9779 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009780 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009781
Jesus Ceaac451502011-04-20 17:09:23 +02009782 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9783 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009784 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786 if (PyUnicode_READY(self) == -1)
9787 return NULL;
9788 if (PyUnicode_READY(substring) == -1)
9789 return NULL;
9790
9791 result = any_find_slice(
9792 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9793 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009794 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795
9796 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009798 if (result == -2)
9799 return NULL;
9800
Christian Heimes217cfd12007-12-02 14:31:20 +00009801 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802}
9803
9804static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009805unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 Py_UCS4 ch;
9808
9809 if (PyUnicode_READY(self) == -1)
9810 return NULL;
9811 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812 PyErr_SetString(PyExc_IndexError, "string index out of range");
9813 return NULL;
9814 }
9815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9817 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818}
9819
Guido van Rossumc2504932007-09-18 19:42:40 +00009820/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009821 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009822static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009823unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824{
Guido van Rossumc2504932007-09-18 19:42:40 +00009825 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009826 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 if (_PyUnicode_HASH(self) != -1)
9829 return _PyUnicode_HASH(self);
9830 if (PyUnicode_READY(self) == -1)
9831 return -1;
9832 len = PyUnicode_GET_LENGTH(self);
9833
9834 /* The hash function as a macro, gets expanded three times below. */
9835#define HASH(P) \
9836 x = (Py_uhash_t)*P << 7; \
9837 while (--len >= 0) \
9838 x = (1000003*x) ^ (Py_uhash_t)*P++;
9839
9840 switch (PyUnicode_KIND(self)) {
9841 case PyUnicode_1BYTE_KIND: {
9842 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9843 HASH(c);
9844 break;
9845 }
9846 case PyUnicode_2BYTE_KIND: {
9847 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9848 HASH(s);
9849 break;
9850 }
9851 default: {
9852 Py_UCS4 *l;
9853 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9854 "Impossible switch case in unicode_hash");
9855 l = PyUnicode_4BYTE_DATA(self);
9856 HASH(l);
9857 break;
9858 }
9859 }
9860 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9861
Guido van Rossumc2504932007-09-18 19:42:40 +00009862 if (x == -1)
9863 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009865 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009869PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009870 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009872Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873
9874static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009876{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009877 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009878 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009879 Py_ssize_t start;
9880 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881
Jesus Ceaac451502011-04-20 17:09:23 +02009882 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9883 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 if (PyUnicode_READY(self) == -1)
9887 return NULL;
9888 if (PyUnicode_READY(substring) == -1)
9889 return NULL;
9890
9891 result = any_find_slice(
9892 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9893 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009894 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895
9896 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 if (result == -2)
9899 return NULL;
9900
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901 if (result < 0) {
9902 PyErr_SetString(PyExc_ValueError, "substring not found");
9903 return NULL;
9904 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009905
Christian Heimes217cfd12007-12-02 14:31:20 +00009906 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907}
9908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009909PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009910 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009912Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009913at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914
9915static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009916unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 Py_ssize_t i, length;
9919 int kind;
9920 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009921 int cased;
9922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 if (PyUnicode_READY(self) == -1)
9924 return NULL;
9925 length = PyUnicode_GET_LENGTH(self);
9926 kind = PyUnicode_KIND(self);
9927 data = PyUnicode_DATA(self);
9928
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 if (length == 1)
9931 return PyBool_FromLong(
9932 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009934 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009936 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009937
Guido van Rossumd57fd912000-03-10 22:53:23 +00009938 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 for (i = 0; i < length; i++) {
9940 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009941
Benjamin Peterson29060642009-01-31 22:14:21 +00009942 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9943 return PyBool_FromLong(0);
9944 else if (!cased && Py_UNICODE_ISLOWER(ch))
9945 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009947 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948}
9949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009950PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009951 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009953Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009954at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955
9956static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009957unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 Py_ssize_t i, length;
9960 int kind;
9961 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009962 int cased;
9963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 if (PyUnicode_READY(self) == -1)
9965 return NULL;
9966 length = PyUnicode_GET_LENGTH(self);
9967 kind = PyUnicode_KIND(self);
9968 data = PyUnicode_DATA(self);
9969
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 if (length == 1)
9972 return PyBool_FromLong(
9973 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009974
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009975 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009977 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009978
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 for (i = 0; i < length; i++) {
9981 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009982
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9984 return PyBool_FromLong(0);
9985 else if (!cased && Py_UNICODE_ISUPPER(ch))
9986 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009988 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989}
9990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009991PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009992 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009994Return True if S is a titlecased string and there is at least one\n\
9995character in S, i.e. upper- and titlecase characters may only\n\
9996follow uncased characters and lowercase characters only cased ones.\n\
9997Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009998
9999static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010000unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 Py_ssize_t i, length;
10003 int kind;
10004 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010005 int cased, previous_is_cased;
10006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 if (PyUnicode_READY(self) == -1)
10008 return NULL;
10009 length = PyUnicode_GET_LENGTH(self);
10010 kind = PyUnicode_KIND(self);
10011 data = PyUnicode_DATA(self);
10012
Guido van Rossumd57fd912000-03-10 22:53:23 +000010013 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 if (length == 1) {
10015 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10016 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10017 (Py_UNICODE_ISUPPER(ch) != 0));
10018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010020 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010022 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010023
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024 cased = 0;
10025 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 for (i = 0; i < length; i++) {
10027 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010028
Benjamin Peterson29060642009-01-31 22:14:21 +000010029 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10030 if (previous_is_cased)
10031 return PyBool_FromLong(0);
10032 previous_is_cased = 1;
10033 cased = 1;
10034 }
10035 else if (Py_UNICODE_ISLOWER(ch)) {
10036 if (!previous_is_cased)
10037 return PyBool_FromLong(0);
10038 previous_is_cased = 1;
10039 cased = 1;
10040 }
10041 else
10042 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010044 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045}
10046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010047PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010048 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010049\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010050Return True if all characters in S are whitespace\n\
10051and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010052
10053static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010054unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 Py_ssize_t i, length;
10057 int kind;
10058 void *data;
10059
10060 if (PyUnicode_READY(self) == -1)
10061 return NULL;
10062 length = PyUnicode_GET_LENGTH(self);
10063 kind = PyUnicode_KIND(self);
10064 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065
Guido van Rossumd57fd912000-03-10 22:53:23 +000010066 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 if (length == 1)
10068 return PyBool_FromLong(
10069 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010070
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010071 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010073 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 for (i = 0; i < length; i++) {
10076 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010077 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010078 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010080 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081}
10082
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010083PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010084 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010085\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010086Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010087and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010088
10089static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010090unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010091{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 Py_ssize_t i, length;
10093 int kind;
10094 void *data;
10095
10096 if (PyUnicode_READY(self) == -1)
10097 return NULL;
10098 length = PyUnicode_GET_LENGTH(self);
10099 kind = PyUnicode_KIND(self);
10100 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010101
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010102 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 if (length == 1)
10104 return PyBool_FromLong(
10105 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010106
10107 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010109 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 for (i = 0; i < length; i++) {
10112 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010113 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010114 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010115 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010116}
10117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010118PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010119 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010120\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010121Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010122and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010123
10124static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010125unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010126{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 int kind;
10128 void *data;
10129 Py_ssize_t len, i;
10130
10131 if (PyUnicode_READY(self) == -1)
10132 return NULL;
10133
10134 kind = PyUnicode_KIND(self);
10135 data = PyUnicode_DATA(self);
10136 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010137
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010138 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 if (len == 1) {
10140 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10141 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10142 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010143
10144 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010146 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 for (i = 0; i < len; i++) {
10149 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010150 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010151 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010152 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010153 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010154}
10155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010156PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010157 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010159Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010160False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161
10162static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010163unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 Py_ssize_t i, length;
10166 int kind;
10167 void *data;
10168
10169 if (PyUnicode_READY(self) == -1)
10170 return NULL;
10171 length = PyUnicode_GET_LENGTH(self);
10172 kind = PyUnicode_KIND(self);
10173 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 if (length == 1)
10177 return PyBool_FromLong(
10178 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010180 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010182 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 for (i = 0; i < length; i++) {
10185 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010186 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010188 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189}
10190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010191PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010192 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010194Return True if all characters in S are digits\n\
10195and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196
10197static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010198unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 Py_ssize_t i, length;
10201 int kind;
10202 void *data;
10203
10204 if (PyUnicode_READY(self) == -1)
10205 return NULL;
10206 length = PyUnicode_GET_LENGTH(self);
10207 kind = PyUnicode_KIND(self);
10208 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 if (length == 1) {
10212 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10213 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010215
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010216 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010218 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 for (i = 0; i < length; i++) {
10221 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010222 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010224 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225}
10226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010227PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010228 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010230Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010231False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232
10233static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010234unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 Py_ssize_t i, length;
10237 int kind;
10238 void *data;
10239
10240 if (PyUnicode_READY(self) == -1)
10241 return NULL;
10242 length = PyUnicode_GET_LENGTH(self);
10243 kind = PyUnicode_KIND(self);
10244 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 if (length == 1)
10248 return PyBool_FromLong(
10249 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010251 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010253 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 for (i = 0; i < length; i++) {
10256 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010257 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010259 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260}
10261
Martin v. Löwis47383402007-08-15 07:32:56 +000010262int
10263PyUnicode_IsIdentifier(PyObject *self)
10264{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 int kind;
10266 void *data;
10267 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010268 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 if (PyUnicode_READY(self) == -1) {
10271 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010272 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 }
10274
10275 /* Special case for empty strings */
10276 if (PyUnicode_GET_LENGTH(self) == 0)
10277 return 0;
10278 kind = PyUnicode_KIND(self);
10279 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010280
10281 /* PEP 3131 says that the first character must be in
10282 XID_Start and subsequent characters in XID_Continue,
10283 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010284 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010285 letters, digits, underscore). However, given the current
10286 definition of XID_Start and XID_Continue, it is sufficient
10287 to check just for these, except that _ must be allowed
10288 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010290 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010291 return 0;
10292
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010293 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010295 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010296 return 1;
10297}
10298
10299PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010300 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010301\n\
10302Return True if S is a valid identifier according\n\
10303to the language definition.");
10304
10305static PyObject*
10306unicode_isidentifier(PyObject *self)
10307{
10308 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10309}
10310
Georg Brandl559e5d72008-06-11 18:37:52 +000010311PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010312 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010313\n\
10314Return True if all characters in S are considered\n\
10315printable in repr() or S is empty, False otherwise.");
10316
10317static PyObject*
10318unicode_isprintable(PyObject *self)
10319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 Py_ssize_t i, length;
10321 int kind;
10322 void *data;
10323
10324 if (PyUnicode_READY(self) == -1)
10325 return NULL;
10326 length = PyUnicode_GET_LENGTH(self);
10327 kind = PyUnicode_KIND(self);
10328 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010329
10330 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 if (length == 1)
10332 return PyBool_FromLong(
10333 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 for (i = 0; i < length; i++) {
10336 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010337 Py_RETURN_FALSE;
10338 }
10339 }
10340 Py_RETURN_TRUE;
10341}
10342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010343PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010344 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345\n\
10346Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010347iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348
10349static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010350unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010352 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010353}
10354
Martin v. Löwis18e16552006-02-15 17:27:45 +000010355static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356unicode_length(PyUnicodeObject *self)
10357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 if (PyUnicode_READY(self) == -1)
10359 return -1;
10360 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361}
10362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010363PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010364 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010366Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010367done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368
10369static PyObject *
10370unicode_ljust(PyUnicodeObject *self, PyObject *args)
10371{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010372 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 Py_UCS4 fillchar = ' ';
10374
10375 if (PyUnicode_READY(self) == -1)
10376 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010377
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010378 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379 return NULL;
10380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382 Py_INCREF(self);
10383 return (PyObject*) self;
10384 }
10385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387}
10388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010389PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010390 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010392Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393
10394static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010395unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397 return fixup(self, fixlower);
10398}
10399
/* Strip modes. Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format string for each strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for mode i: the format string without its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
10408
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010409/* externally visible for str.strip(unicode) */
10410PyObject *
10411_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10412{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 void *data;
10414 int kind;
10415 Py_ssize_t i, j, len;
10416 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10419 return NULL;
10420
10421 kind = PyUnicode_KIND(self);
10422 data = PyUnicode_DATA(self);
10423 len = PyUnicode_GET_LENGTH(self);
10424 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10425 PyUnicode_DATA(sepobj),
10426 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010427
Benjamin Peterson14339b62009-01-31 16:36:08 +000010428 i = 0;
10429 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 while (i < len &&
10431 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010432 i++;
10433 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010434 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010435
Benjamin Peterson14339b62009-01-31 16:36:08 +000010436 j = len;
10437 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010438 do {
10439 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 } while (j >= i &&
10441 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010442 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010443 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010444
Victor Stinner12bab6d2011-10-01 01:53:49 +020010445 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446}
10447
10448PyObject*
10449PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10450{
10451 unsigned char *data;
10452 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010453 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454
Victor Stinner12bab6d2011-10-01 01:53:49 +020010455 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010457 if (PyUnicode_CheckExact(self)) {
10458 Py_INCREF(self);
10459 return self;
10460 }
10461 else
10462 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 }
10464
Victor Stinner12bab6d2011-10-01 01:53:49 +020010465 length = end - start;
10466 if (length == 1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 return unicode_getitem((PyUnicodeObject*)self, start);
10468
Victor Stinner12bab6d2011-10-01 01:53:49 +020010469 if (start < 0 || end < 0 || end > PyUnicode_GET_LENGTH(self)) {
10470 PyErr_SetString(PyExc_IndexError, "string index out of range");
10471 return NULL;
10472 }
10473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 if (PyUnicode_READY(self) == -1)
10475 return NULL;
10476 kind = PyUnicode_KIND(self);
10477 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010478 return PyUnicode_FromKindAndData(kind,
10479 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010480 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482
10483static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010484do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 int kind;
10487 void *data;
10488 Py_ssize_t len, i, j;
10489
10490 if (PyUnicode_READY(self) == -1)
10491 return NULL;
10492
10493 kind = PyUnicode_KIND(self);
10494 data = PyUnicode_DATA(self);
10495 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010496
Benjamin Peterson14339b62009-01-31 16:36:08 +000010497 i = 0;
10498 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010500 i++;
10501 }
10502 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010503
Benjamin Peterson14339b62009-01-31 16:36:08 +000010504 j = len;
10505 if (striptype != LEFTSTRIP) {
10506 do {
10507 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010509 j++;
10510 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010511
Victor Stinner12bab6d2011-10-01 01:53:49 +020010512 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010513}
10514
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010515
10516static PyObject *
10517do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10518{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010519 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010520
Benjamin Peterson14339b62009-01-31 16:36:08 +000010521 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10522 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010523
Benjamin Peterson14339b62009-01-31 16:36:08 +000010524 if (sep != NULL && sep != Py_None) {
10525 if (PyUnicode_Check(sep))
10526 return _PyUnicode_XStrip(self, striptype, sep);
10527 else {
10528 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010529 "%s arg must be None or str",
10530 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010531 return NULL;
10532 }
10533 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010534
Benjamin Peterson14339b62009-01-31 16:36:08 +000010535 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010536}
10537
10538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010539PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010540 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010541\n\
10542Return a copy of the string S with leading and trailing\n\
10543whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010544If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010545
10546static PyObject *
10547unicode_strip(PyUnicodeObject *self, PyObject *args)
10548{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010549 if (PyTuple_GET_SIZE(args) == 0)
10550 return do_strip(self, BOTHSTRIP); /* Common case */
10551 else
10552 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010553}
10554
10555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010556PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010557 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010558\n\
10559Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010560If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010561
10562static PyObject *
10563unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10564{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010565 if (PyTuple_GET_SIZE(args) == 0)
10566 return do_strip(self, LEFTSTRIP); /* Common case */
10567 else
10568 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010569}
10570
10571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010572PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010573 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010574\n\
10575Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010576If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010577
10578static PyObject *
10579unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10580{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010581 if (PyTuple_GET_SIZE(args) == 0)
10582 return do_strip(self, RIGHTSTRIP); /* Common case */
10583 else
10584 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010585}
10586
10587
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010589unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010590{
10591 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 Py_ssize_t nchars, n;
10593 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594
Georg Brandl222de0f2009-04-12 12:01:50 +000010595 if (len < 1) {
10596 Py_INCREF(unicode_empty);
10597 return (PyObject *)unicode_empty;
10598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599
Tim Peters7a29bd52001-09-12 03:03:31 +000010600 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601 /* no repeat, return original string */
10602 Py_INCREF(str);
10603 return (PyObject*) str;
10604 }
Tim Peters8f422462000-09-09 06:13:41 +000010605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 if (PyUnicode_READY(str) == -1)
10607 return NULL;
10608
Tim Peters8f422462000-09-09 06:13:41 +000010609 /* ensure # of chars needed doesn't overflow int and # of bytes
10610 * needed doesn't overflow size_t
10611 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 nchars = len * PyUnicode_GET_LENGTH(str);
10613 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010614 PyErr_SetString(PyExc_OverflowError,
10615 "repeated string is too long");
10616 return NULL;
10617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 char_size = PyUnicode_CHARACTER_SIZE(str);
10619 nbytes = (nchars + 1) * char_size;
10620 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010621 PyErr_SetString(PyExc_OverflowError,
10622 "repeated string is too long");
10623 return NULL;
10624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626 if (!u)
10627 return NULL;
10628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 if (PyUnicode_GET_LENGTH(str) == 1) {
10630 const int kind = PyUnicode_KIND(str);
10631 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10632 void *to = PyUnicode_DATA(u);
10633 for (n = 0; n < len; ++n)
10634 PyUnicode_WRITE(kind, to, n, fill_char);
10635 }
10636 else {
10637 /* number of characters copied this far */
10638 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10639 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10640 char *to = (char *) PyUnicode_DATA(u);
10641 Py_MEMCPY(to, PyUnicode_DATA(str),
10642 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010643 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 n = (done <= nchars-done) ? done : nchars-done;
10645 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010646 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010647 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648 }
10649
10650 return (PyObject*) u;
10651}
10652
Alexander Belopolsky40018472011-02-26 01:02:56 +000010653PyObject *
10654PyUnicode_Replace(PyObject *obj,
10655 PyObject *subobj,
10656 PyObject *replobj,
10657 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010658{
10659 PyObject *self;
10660 PyObject *str1;
10661 PyObject *str2;
10662 PyObject *result;
10663
10664 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010667 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010669 Py_DECREF(self);
10670 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671 }
10672 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010674 Py_DECREF(self);
10675 Py_DECREF(str1);
10676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679 Py_DECREF(self);
10680 Py_DECREF(str1);
10681 Py_DECREF(str2);
10682 return result;
10683}
10684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010685PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010686 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010687\n\
10688Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010689old replaced by new. If the optional argument count is\n\
10690given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691
10692static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 PyObject *str1;
10696 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010697 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698 PyObject *result;
10699
Martin v. Löwis18e16552006-02-15 17:27:45 +000010700 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010703 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 str1 = PyUnicode_FromObject(str1);
10705 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10706 return NULL;
10707 str2 = PyUnicode_FromObject(str2);
10708 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010709 Py_DECREF(str1);
10710 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712
10713 result = replace(self, str1, str2, maxcount);
10714
10715 Py_DECREF(str1);
10716 Py_DECREF(str2);
10717 return result;
10718}
10719
Alexander Belopolsky40018472011-02-26 01:02:56 +000010720static PyObject *
10721unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010723 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 Py_ssize_t isize;
10725 Py_ssize_t osize, squote, dquote, i, o;
10726 Py_UCS4 max, quote;
10727 int ikind, okind;
10728 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010731 return NULL;
10732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 isize = PyUnicode_GET_LENGTH(unicode);
10734 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 /* Compute length of output, quote characters, and
10737 maximum character */
10738 osize = 2; /* quotes */
10739 max = 127;
10740 squote = dquote = 0;
10741 ikind = PyUnicode_KIND(unicode);
10742 for (i = 0; i < isize; i++) {
10743 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10744 switch (ch) {
10745 case '\'': squote++; osize++; break;
10746 case '"': dquote++; osize++; break;
10747 case '\\': case '\t': case '\r': case '\n':
10748 osize += 2; break;
10749 default:
10750 /* Fast-path ASCII */
10751 if (ch < ' ' || ch == 0x7f)
10752 osize += 4; /* \xHH */
10753 else if (ch < 0x7f)
10754 osize++;
10755 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10756 osize++;
10757 max = ch > max ? ch : max;
10758 }
10759 else if (ch < 0x100)
10760 osize += 4; /* \xHH */
10761 else if (ch < 0x10000)
10762 osize += 6; /* \uHHHH */
10763 else
10764 osize += 10; /* \uHHHHHHHH */
10765 }
10766 }
10767
10768 quote = '\'';
10769 if (squote) {
10770 if (dquote)
10771 /* Both squote and dquote present. Use squote,
10772 and escape them */
10773 osize += squote;
10774 else
10775 quote = '"';
10776 }
10777
10778 repr = PyUnicode_New(osize, max);
10779 if (repr == NULL)
10780 return NULL;
10781 okind = PyUnicode_KIND(repr);
10782 odata = PyUnicode_DATA(repr);
10783
10784 PyUnicode_WRITE(okind, odata, 0, quote);
10785 PyUnicode_WRITE(okind, odata, osize-1, quote);
10786
10787 for (i = 0, o = 1; i < isize; i++) {
10788 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010789
10790 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 if ((ch == quote) || (ch == '\\')) {
10792 PyUnicode_WRITE(okind, odata, o++, '\\');
10793 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010794 continue;
10795 }
10796
Benjamin Peterson29060642009-01-31 22:14:21 +000010797 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010798 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 PyUnicode_WRITE(okind, odata, o++, '\\');
10800 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010801 }
10802 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 PyUnicode_WRITE(okind, odata, o++, '\\');
10804 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010805 }
10806 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 PyUnicode_WRITE(okind, odata, o++, '\\');
10808 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010809 }
10810
10811 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010812 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 PyUnicode_WRITE(okind, odata, o++, '\\');
10814 PyUnicode_WRITE(okind, odata, o++, 'x');
10815 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10816 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010817 }
10818
Georg Brandl559e5d72008-06-11 18:37:52 +000010819 /* Copy ASCII characters as-is */
10820 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010822 }
10823
Benjamin Peterson29060642009-01-31 22:14:21 +000010824 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010825 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010826 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010827 (categories Z* and C* except ASCII space)
10828 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010830 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 if (ch <= 0xff) {
10832 PyUnicode_WRITE(okind, odata, o++, '\\');
10833 PyUnicode_WRITE(okind, odata, o++, 'x');
10834 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10835 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010836 }
10837 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 else if (ch >= 0x10000) {
10839 PyUnicode_WRITE(okind, odata, o++, '\\');
10840 PyUnicode_WRITE(okind, odata, o++, 'U');
10841 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10842 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10843 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10844 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10845 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10846 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10847 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10848 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010849 }
10850 /* Map 16-bit characters to '\uxxxx' */
10851 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010852 PyUnicode_WRITE(okind, odata, o++, '\\');
10853 PyUnicode_WRITE(okind, odata, o++, 'u');
10854 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10855 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10856 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10857 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010858 }
10859 }
10860 /* Copy characters as-is */
10861 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010863 }
10864 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010867 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868}
10869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010870PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872\n\
10873Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010874such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875arguments start and end are interpreted as in slice notation.\n\
10876\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010877Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878
10879static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010881{
Jesus Ceaac451502011-04-20 17:09:23 +020010882 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010883 Py_ssize_t start;
10884 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010885 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886
Jesus Ceaac451502011-04-20 17:09:23 +020010887 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10888 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010889 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 if (PyUnicode_READY(self) == -1)
10892 return NULL;
10893 if (PyUnicode_READY(substring) == -1)
10894 return NULL;
10895
10896 result = any_find_slice(
10897 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10898 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010899 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010900
10901 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903 if (result == -2)
10904 return NULL;
10905
Christian Heimes217cfd12007-12-02 14:31:20 +000010906 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907}
10908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010909PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010910 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010912Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913
10914static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010915unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916{
Jesus Ceaac451502011-04-20 17:09:23 +020010917 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010918 Py_ssize_t start;
10919 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010920 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010921
Jesus Ceaac451502011-04-20 17:09:23 +020010922 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10923 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010926 if (PyUnicode_READY(self) == -1)
10927 return NULL;
10928 if (PyUnicode_READY(substring) == -1)
10929 return NULL;
10930
10931 result = any_find_slice(
10932 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10933 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010934 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935
10936 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 if (result == -2)
10939 return NULL;
10940
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941 if (result < 0) {
10942 PyErr_SetString(PyExc_ValueError, "substring not found");
10943 return NULL;
10944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945
Christian Heimes217cfd12007-12-02 14:31:20 +000010946 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947}
10948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010949PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010950 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010952Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010953done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954
10955static PyObject *
10956unicode_rjust(PyUnicodeObject *self, PyObject *args)
10957{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010958 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 Py_UCS4 fillchar = ' ';
10960
10961 if (PyUnicode_READY(self) == -1)
10962 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010963
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010964 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965 return NULL;
10966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968 Py_INCREF(self);
10969 return (PyObject*) self;
10970 }
10971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973}
10974
Alexander Belopolsky40018472011-02-26 01:02:56 +000010975PyObject *
10976PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977{
10978 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010979
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980 s = PyUnicode_FromObject(s);
10981 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010982 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010983 if (sep != NULL) {
10984 sep = PyUnicode_FromObject(sep);
10985 if (sep == NULL) {
10986 Py_DECREF(s);
10987 return NULL;
10988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989 }
10990
10991 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10992
10993 Py_DECREF(s);
10994 Py_XDECREF(sep);
10995 return result;
10996}
10997
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010998PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010999 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000\n\
11001Return a list of the words in S, using sep as the\n\
11002delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011003splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011004whitespace string is a separator and empty strings are\n\
11005removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006
11007static PyObject*
11008unicode_split(PyUnicodeObject *self, PyObject *args)
11009{
11010 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011011 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012
Martin v. Löwis18e16552006-02-15 17:27:45 +000011013 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014 return NULL;
11015
11016 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011017 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011019 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011021 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022}
11023
Thomas Wouters477c8d52006-05-27 19:21:47 +000011024PyObject *
11025PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11026{
11027 PyObject* str_obj;
11028 PyObject* sep_obj;
11029 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 int kind1, kind2, kind;
11031 void *buf1 = NULL, *buf2 = NULL;
11032 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011033
11034 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011036 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011037 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011039 Py_DECREF(str_obj);
11040 return NULL;
11041 }
11042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011043 kind1 = PyUnicode_KIND(str_in);
11044 kind2 = PyUnicode_KIND(sep_obj);
11045 kind = kind1 > kind2 ? kind1 : kind2;
11046 buf1 = PyUnicode_DATA(str_in);
11047 if (kind1 != kind)
11048 buf1 = _PyUnicode_AsKind(str_in, kind);
11049 if (!buf1)
11050 goto onError;
11051 buf2 = PyUnicode_DATA(sep_obj);
11052 if (kind2 != kind)
11053 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11054 if (!buf2)
11055 goto onError;
11056 len1 = PyUnicode_GET_LENGTH(str_obj);
11057 len2 = PyUnicode_GET_LENGTH(sep_obj);
11058
11059 switch(PyUnicode_KIND(str_in)) {
11060 case PyUnicode_1BYTE_KIND:
11061 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11062 break;
11063 case PyUnicode_2BYTE_KIND:
11064 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11065 break;
11066 case PyUnicode_4BYTE_KIND:
11067 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11068 break;
11069 default:
11070 assert(0);
11071 out = 0;
11072 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011073
11074 Py_DECREF(sep_obj);
11075 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 if (kind1 != kind)
11077 PyMem_Free(buf1);
11078 if (kind2 != kind)
11079 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011080
11081 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 onError:
11083 Py_DECREF(sep_obj);
11084 Py_DECREF(str_obj);
11085 if (kind1 != kind && buf1)
11086 PyMem_Free(buf1);
11087 if (kind2 != kind && buf2)
11088 PyMem_Free(buf2);
11089 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011090}
11091
11092
11093PyObject *
11094PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11095{
11096 PyObject* str_obj;
11097 PyObject* sep_obj;
11098 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099 int kind1, kind2, kind;
11100 void *buf1 = NULL, *buf2 = NULL;
11101 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011102
11103 str_obj = PyUnicode_FromObject(str_in);
11104 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011105 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011106 sep_obj = PyUnicode_FromObject(sep_in);
11107 if (!sep_obj) {
11108 Py_DECREF(str_obj);
11109 return NULL;
11110 }
11111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112 kind1 = PyUnicode_KIND(str_in);
11113 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011114 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 buf1 = PyUnicode_DATA(str_in);
11116 if (kind1 != kind)
11117 buf1 = _PyUnicode_AsKind(str_in, kind);
11118 if (!buf1)
11119 goto onError;
11120 buf2 = PyUnicode_DATA(sep_obj);
11121 if (kind2 != kind)
11122 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11123 if (!buf2)
11124 goto onError;
11125 len1 = PyUnicode_GET_LENGTH(str_obj);
11126 len2 = PyUnicode_GET_LENGTH(sep_obj);
11127
11128 switch(PyUnicode_KIND(str_in)) {
11129 case PyUnicode_1BYTE_KIND:
11130 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11131 break;
11132 case PyUnicode_2BYTE_KIND:
11133 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11134 break;
11135 case PyUnicode_4BYTE_KIND:
11136 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11137 break;
11138 default:
11139 assert(0);
11140 out = 0;
11141 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011142
11143 Py_DECREF(sep_obj);
11144 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011145 if (kind1 != kind)
11146 PyMem_Free(buf1);
11147 if (kind2 != kind)
11148 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011149
11150 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 onError:
11152 Py_DECREF(sep_obj);
11153 Py_DECREF(str_obj);
11154 if (kind1 != kind && buf1)
11155 PyMem_Free(buf1);
11156 if (kind2 != kind && buf2)
11157 PyMem_Free(buf2);
11158 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011159}
11160
11161PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011162 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011163\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011164Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011165the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011166found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011167
11168static PyObject*
11169unicode_partition(PyUnicodeObject *self, PyObject *separator)
11170{
11171 return PyUnicode_Partition((PyObject *)self, separator);
11172}
11173
11174PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011175 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011176\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011177Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011178the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011179separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011180
11181static PyObject*
11182unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11183{
11184 return PyUnicode_RPartition((PyObject *)self, separator);
11185}
11186
Alexander Belopolsky40018472011-02-26 01:02:56 +000011187PyObject *
11188PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011189{
11190 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011191
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011192 s = PyUnicode_FromObject(s);
11193 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011194 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011195 if (sep != NULL) {
11196 sep = PyUnicode_FromObject(sep);
11197 if (sep == NULL) {
11198 Py_DECREF(s);
11199 return NULL;
11200 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011201 }
11202
11203 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11204
11205 Py_DECREF(s);
11206 Py_XDECREF(sep);
11207 return result;
11208}
11209
11210PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011211 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011212\n\
11213Return a list of the words in S, using sep as the\n\
11214delimiter string, starting at the end of the string and\n\
11215working to the front. If maxsplit is given, at most maxsplit\n\
11216splits are done. If sep is not specified, any whitespace string\n\
11217is a separator.");
11218
11219static PyObject*
11220unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11221{
11222 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011223 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011224
Martin v. Löwis18e16552006-02-15 17:27:45 +000011225 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011226 return NULL;
11227
11228 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011229 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011230 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011231 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011232 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011233 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011234}
11235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011236PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238\n\
11239Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011240Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011241is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242
11243static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011244unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011246 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011247 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011249 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11250 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251 return NULL;
11252
Guido van Rossum86662912000-04-11 15:38:46 +000011253 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254}
11255
11256static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011257PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258{
Walter Dörwald346737f2007-05-31 10:44:43 +000011259 if (PyUnicode_CheckExact(self)) {
11260 Py_INCREF(self);
11261 return self;
11262 } else
11263 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011264 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265}
11266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011267PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269\n\
11270Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011271and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272
11273static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011274unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 return fixup(self, fixswapcase);
11277}
11278
Georg Brandlceee0772007-11-27 23:48:05 +000011279PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011280 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011281\n\
11282Return a translation table usable for str.translate().\n\
11283If there is only one argument, it must be a dictionary mapping Unicode\n\
11284ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011285Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011286If there are two arguments, they must be strings of equal length, and\n\
11287in the resulting dictionary, each character in x will be mapped to the\n\
11288character at the same position in y. If there is a third argument, it\n\
11289must be a string, whose characters will be mapped to None in the result.");
11290
11291static PyObject*
11292unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11293{
11294 PyObject *x, *y = NULL, *z = NULL;
11295 PyObject *new = NULL, *key, *value;
11296 Py_ssize_t i = 0;
11297 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011298
Georg Brandlceee0772007-11-27 23:48:05 +000011299 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11300 return NULL;
11301 new = PyDict_New();
11302 if (!new)
11303 return NULL;
11304 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305 int x_kind, y_kind, z_kind;
11306 void *x_data, *y_data, *z_data;
11307
Georg Brandlceee0772007-11-27 23:48:05 +000011308 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011309 if (!PyUnicode_Check(x)) {
11310 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11311 "be a string if there is a second argument");
11312 goto err;
11313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011315 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11316 "arguments must have equal length");
11317 goto err;
11318 }
11319 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 x_kind = PyUnicode_KIND(x);
11321 y_kind = PyUnicode_KIND(y);
11322 x_data = PyUnicode_DATA(x);
11323 y_data = PyUnicode_DATA(y);
11324 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11325 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11326 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011327 if (!key || !value)
11328 goto err;
11329 res = PyDict_SetItem(new, key, value);
11330 Py_DECREF(key);
11331 Py_DECREF(value);
11332 if (res < 0)
11333 goto err;
11334 }
11335 /* create entries for deleting chars in z */
11336 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 z_kind = PyUnicode_KIND(z);
11338 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011339 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011341 if (!key)
11342 goto err;
11343 res = PyDict_SetItem(new, key, Py_None);
11344 Py_DECREF(key);
11345 if (res < 0)
11346 goto err;
11347 }
11348 }
11349 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350 int kind;
11351 void *data;
11352
Georg Brandlceee0772007-11-27 23:48:05 +000011353 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011354 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011355 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11356 "to maketrans it must be a dict");
11357 goto err;
11358 }
11359 /* copy entries into the new dict, converting string keys to int keys */
11360 while (PyDict_Next(x, &i, &key, &value)) {
11361 if (PyUnicode_Check(key)) {
11362 /* convert string keys to integer keys */
11363 PyObject *newkey;
11364 if (PyUnicode_GET_SIZE(key) != 1) {
11365 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11366 "table must be of length 1");
11367 goto err;
11368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 kind = PyUnicode_KIND(key);
11370 data = PyUnicode_DATA(key);
11371 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011372 if (!newkey)
11373 goto err;
11374 res = PyDict_SetItem(new, newkey, value);
11375 Py_DECREF(newkey);
11376 if (res < 0)
11377 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011378 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011379 /* just keep integer keys */
11380 if (PyDict_SetItem(new, key, value) < 0)
11381 goto err;
11382 } else {
11383 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11384 "be strings or integers");
11385 goto err;
11386 }
11387 }
11388 }
11389 return new;
11390 err:
11391 Py_DECREF(new);
11392 return NULL;
11393}
11394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011395PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011396 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397\n\
11398Return a copy of the string S, where all characters have been mapped\n\
11399through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011400Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011401Unmapped characters are left untouched. Characters mapped to None\n\
11402are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403
11404static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408}
11409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011413Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414
11415static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011416unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418 return fixup(self, fixupper);
11419}
11420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011421PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011422 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011424Pad a numeric string S with zeros on the left, to fill a field\n\
11425of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426
11427static PyObject *
11428unicode_zfill(PyUnicodeObject *self, PyObject *args)
11429{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011430 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011432 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 int kind;
11434 void *data;
11435 Py_UCS4 chr;
11436
11437 if (PyUnicode_READY(self) == -1)
11438 return NULL;
11439
Martin v. Löwis18e16552006-02-15 17:27:45 +000011440 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441 return NULL;
11442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011444 if (PyUnicode_CheckExact(self)) {
11445 Py_INCREF(self);
11446 return (PyObject*) self;
11447 }
11448 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011449 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450 }
11451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453
11454 u = pad(self, fill, 0, '0');
11455
Walter Dörwald068325e2002-04-15 13:36:47 +000011456 if (u == NULL)
11457 return NULL;
11458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 kind = PyUnicode_KIND(u);
11460 data = PyUnicode_DATA(u);
11461 chr = PyUnicode_READ(kind, data, fill);
11462
11463 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 PyUnicode_WRITE(kind, data, 0, chr);
11466 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467 }
11468
11469 return (PyObject*) u;
11470}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471
#if 0
/* Compiled out by the surrounding #if 0.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011480PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011483Return True if S starts with the specified prefix, False otherwise.\n\
11484With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011485With optional end, stop comparing S at that position.\n\
11486prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
11488static PyObject *
11489unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011492 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011494 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011495 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011496 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497
Jesus Ceaac451502011-04-20 17:09:23 +020011498 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011500 if (PyTuple_Check(subobj)) {
11501 Py_ssize_t i;
11502 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11503 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011504 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011505 if (substring == NULL)
11506 return NULL;
11507 result = tailmatch(self, substring, start, end, -1);
11508 Py_DECREF(substring);
11509 if (result) {
11510 Py_RETURN_TRUE;
11511 }
11512 }
11513 /* nothing matched */
11514 Py_RETURN_FALSE;
11515 }
11516 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011517 if (substring == NULL) {
11518 if (PyErr_ExceptionMatches(PyExc_TypeError))
11519 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11520 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011521 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011522 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011523 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011525 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526}
11527
11528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011529PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011530 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011532Return True if S ends with the specified suffix, False otherwise.\n\
11533With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011534With optional end, stop comparing S at that position.\n\
11535suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
11537static PyObject *
11538unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011539 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011541 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011543 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011544 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011545 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
Jesus Ceaac451502011-04-20 17:09:23 +020011547 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011548 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011549 if (PyTuple_Check(subobj)) {
11550 Py_ssize_t i;
11551 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11552 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011554 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011556 result = tailmatch(self, substring, start, end, +1);
11557 Py_DECREF(substring);
11558 if (result) {
11559 Py_RETURN_TRUE;
11560 }
11561 }
11562 Py_RETURN_FALSE;
11563 }
11564 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011565 if (substring == NULL) {
11566 if (PyErr_ExceptionMatches(PyExc_TypeError))
11567 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11568 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011569 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011570 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011571 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011573 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574}
11575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011577
11578PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011579 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011580\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011581Return a formatted version of S, using substitutions from args and kwargs.\n\
11582The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011583
Eric Smith27bbca62010-11-04 17:06:58 +000011584PyDoc_STRVAR(format_map__doc__,
11585 "S.format_map(mapping) -> str\n\
11586\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011587Return a formatted version of S, using substitutions from mapping.\n\
11588The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011589
Eric Smith4a7d76d2008-05-30 18:10:19 +000011590static PyObject *
11591unicode__format__(PyObject* self, PyObject* args)
11592{
11593 PyObject *format_spec;
11594
11595 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11596 return NULL;
11597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11599 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011600}
11601
Eric Smith8c663262007-08-25 02:26:07 +000011602PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011603 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011604\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011605Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011606
11607static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011608unicode__sizeof__(PyUnicodeObject *v)
11609{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 Py_ssize_t size;
11611
11612 /* If it's a compact object, account for base structure +
11613 character data. */
11614 if (PyUnicode_IS_COMPACT_ASCII(v))
11615 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11616 else if (PyUnicode_IS_COMPACT(v))
11617 size = sizeof(PyCompactUnicodeObject) +
11618 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11619 else {
11620 /* If it is a two-block object, account for base object, and
11621 for character block if present. */
11622 size = sizeof(PyUnicodeObject);
11623 if (v->data.any)
11624 size += (PyUnicode_GET_LENGTH(v) + 1) *
11625 PyUnicode_CHARACTER_SIZE(v);
11626 }
11627 /* If the wstr pointer is present, account for it unless it is shared
11628 with the data pointer. Since PyUnicode_DATA will crash if the object
11629 is not ready, check whether it's either not ready (in which case the
11630 data is entirely in wstr) or if the data is not shared. */
11631 if (_PyUnicode_WSTR(v) &&
11632 (!PyUnicode_IS_READY(v) ||
11633 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11634 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11635 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11636 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11637
11638 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011639}
11640
11641PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011643
11644static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011645unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011646{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011647 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011648 if (!copy)
11649 return NULL;
11650 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011651}
11652
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653static PyMethodDef unicode_methods[] = {
11654
11655 /* Order is according to common usage: often used methods should
11656 appear first, since lookup is done sequentially. */
11657
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011658 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011659 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11660 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011661 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011662 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11663 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11664 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11665 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11666 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11667 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11668 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011669 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011670 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11671 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11672 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011673 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011674 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11675 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11676 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011677 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011678 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011679 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011680 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011681 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11682 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11683 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11684 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11685 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11686 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11687 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11688 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11689 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11690 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11691 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11692 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11693 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11694 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011695 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011696 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011697 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011698 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011699 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011700 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011701 {"maketrans", (PyCFunction) unicode_maketrans,
11702 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011703 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011704#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011705 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706#endif
11707
11708#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011709 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011710 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711#endif
11712
Benjamin Peterson14339b62009-01-31 16:36:08 +000011713 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714 {NULL, NULL}
11715};
11716
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011717static PyObject *
11718unicode_mod(PyObject *v, PyObject *w)
11719{
Brian Curtindfc80e32011-08-10 20:28:54 -050011720 if (!PyUnicode_Check(v))
11721 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011722 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011723}
11724
11725static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011726 0, /*nb_add*/
11727 0, /*nb_subtract*/
11728 0, /*nb_multiply*/
11729 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011730};
11731
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011733 (lenfunc) unicode_length, /* sq_length */
11734 PyUnicode_Concat, /* sq_concat */
11735 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11736 (ssizeargfunc) unicode_getitem, /* sq_item */
11737 0, /* sq_slice */
11738 0, /* sq_ass_item */
11739 0, /* sq_ass_slice */
11740 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741};
11742
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011743static PyObject*
11744unicode_subscript(PyUnicodeObject* self, PyObject* item)
11745{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 if (PyUnicode_READY(self) == -1)
11747 return NULL;
11748
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011749 if (PyIndex_Check(item)) {
11750 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011751 if (i == -1 && PyErr_Occurred())
11752 return NULL;
11753 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011755 return unicode_getitem(self, i);
11756 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011757 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011759 Py_UNICODE* result_buf;
11760 PyObject* result;
11761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011764 return NULL;
11765 }
11766
11767 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 return PyUnicode_New(0, 0);
11769 } else if (start == 0 && step == 1 &&
11770 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011771 PyUnicode_CheckExact(self)) {
11772 Py_INCREF(self);
11773 return (PyObject *)self;
11774 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011775 return PyUnicode_Substring((PyObject*)self,
11776 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011777 } else {
11778 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011779 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11780 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011781
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 if (result_buf == NULL)
11783 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011784
11785 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11786 result_buf[i] = source_buf[cur];
11787 }
Tim Petersced69f82003-09-16 20:30:58 +000011788
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011789 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011790 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011791 return result;
11792 }
11793 } else {
11794 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11795 return NULL;
11796 }
11797}
11798
11799static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011800 (lenfunc)unicode_length, /* mp_length */
11801 (binaryfunc)unicode_subscript, /* mp_subscript */
11802 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011803};
11804
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806/* Helpers for PyUnicode_Format() */
11807
11808static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011809getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011811 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011813 (*p_argidx)++;
11814 if (arglen < 0)
11815 return args;
11816 else
11817 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818 }
11819 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 return NULL;
11822}
11823
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011824/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011826static PyObject *
11827formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011829 char *p;
11830 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011832
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833 x = PyFloat_AsDouble(v);
11834 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011835 return NULL;
11836
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011839
Eric Smith0923d1d2009-04-16 20:16:10 +000011840 p = PyOS_double_to_string(x, type, prec,
11841 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011842 if (p == NULL)
11843 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011845 PyMem_Free(p);
11846 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847}
11848
Tim Peters38fd5b62000-09-21 05:43:11 +000011849static PyObject*
11850formatlong(PyObject *val, int flags, int prec, int type)
11851{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011852 char *buf;
11853 int len;
11854 PyObject *str; /* temporary string object. */
11855 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011856
Benjamin Peterson14339b62009-01-31 16:36:08 +000011857 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11858 if (!str)
11859 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011861 Py_DECREF(str);
11862 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011863}
11864
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011867 size_t buflen,
11868 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011870 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011871 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 if (PyUnicode_GET_LENGTH(v) == 1) {
11873 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011874 buf[1] = '\0';
11875 return 1;
11876 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 goto onError;
11878 }
11879 else {
11880 /* Integer input truncated to a character */
11881 long x;
11882 x = PyLong_AsLong(v);
11883 if (x == -1 && PyErr_Occurred())
11884 goto onError;
11885
11886 if (x < 0 || x > 0x10ffff) {
11887 PyErr_SetString(PyExc_OverflowError,
11888 "%c arg not in range(0x110000)");
11889 return -1;
11890 }
11891
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011893 buf[1] = '\0';
11894 return 1;
11895 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011896
Benjamin Peterson29060642009-01-31 22:14:21 +000011897 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011898 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011899 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011900 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901}
11902
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so the macro behaves as a single
   primary expression wherever it is used.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011907
Alexander Belopolsky40018472011-02-26 01:02:56 +000011908PyObject *
11909PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 void *fmt;
11912 int fmtkind;
11913 PyObject *result;
11914 Py_UCS4 *res, *res0;
11915 Py_UCS4 max;
11916 int kind;
11917 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011921
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011923 PyErr_BadInternalCall();
11924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11927 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011928 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 fmt = PyUnicode_DATA(uformat);
11930 fmtkind = PyUnicode_KIND(uformat);
11931 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11932 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933
11934 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11936 if (res0 == NULL) {
11937 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940
11941 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 arglen = PyTuple_Size(args);
11943 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944 }
11945 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 arglen = -1;
11947 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011949 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011950 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011951 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952
11953 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 if (--rescnt < 0) {
11956 rescnt = fmtcnt + 100;
11957 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11959 if (res0 == NULL){
11960 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011961 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 }
11963 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011967 }
11968 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 /* Got a format specifier */
11970 int flags = 0;
11971 Py_ssize_t width = -1;
11972 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 Py_UCS4 c = '\0';
11974 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011975 int isnumok;
11976 PyObject *v = NULL;
11977 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 void *pbuf;
11979 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011980 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 Py_ssize_t len, len1;
11982 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 fmtpos++;
11985 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
11986 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000011987 Py_ssize_t keylen;
11988 PyObject *key;
11989 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000011990
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 if (dict == NULL) {
11992 PyErr_SetString(PyExc_TypeError,
11993 "format requires a mapping");
11994 goto onError;
11995 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011999 /* Skip over balanced parentheses */
12000 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012008 if (fmtcnt < 0 || pcount > 0) {
12009 PyErr_SetString(PyExc_ValueError,
12010 "incomplete format key");
12011 goto onError;
12012 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012013 key = PyUnicode_Substring((PyObject*)uformat,
12014 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012015 if (key == NULL)
12016 goto onError;
12017 if (args_owned) {
12018 Py_DECREF(args);
12019 args_owned = 0;
12020 }
12021 args = PyObject_GetItem(dict, key);
12022 Py_DECREF(key);
12023 if (args == NULL) {
12024 goto onError;
12025 }
12026 args_owned = 1;
12027 arglen = -1;
12028 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012029 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 case '-': flags |= F_LJUST; continue;
12033 case '+': flags |= F_SIGN; continue;
12034 case ' ': flags |= F_BLANK; continue;
12035 case '#': flags |= F_ALT; continue;
12036 case '0': flags |= F_ZERO; continue;
12037 }
12038 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012039 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012040 if (c == '*') {
12041 v = getnextarg(args, arglen, &argidx);
12042 if (v == NULL)
12043 goto onError;
12044 if (!PyLong_Check(v)) {
12045 PyErr_SetString(PyExc_TypeError,
12046 "* wants int");
12047 goto onError;
12048 }
12049 width = PyLong_AsLong(v);
12050 if (width == -1 && PyErr_Occurred())
12051 goto onError;
12052 if (width < 0) {
12053 flags |= F_LJUST;
12054 width = -width;
12055 }
12056 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012058 }
12059 else if (c >= '0' && c <= '9') {
12060 width = c - '0';
12061 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012063 if (c < '0' || c > '9')
12064 break;
12065 if ((width*10) / 10 != width) {
12066 PyErr_SetString(PyExc_ValueError,
12067 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012068 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 }
12070 width = width*10 + (c - '0');
12071 }
12072 }
12073 if (c == '.') {
12074 prec = 0;
12075 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012077 if (c == '*') {
12078 v = getnextarg(args, arglen, &argidx);
12079 if (v == NULL)
12080 goto onError;
12081 if (!PyLong_Check(v)) {
12082 PyErr_SetString(PyExc_TypeError,
12083 "* wants int");
12084 goto onError;
12085 }
12086 prec = PyLong_AsLong(v);
12087 if (prec == -1 && PyErr_Occurred())
12088 goto onError;
12089 if (prec < 0)
12090 prec = 0;
12091 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012093 }
12094 else if (c >= '0' && c <= '9') {
12095 prec = c - '0';
12096 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 if (c < '0' || c > '9')
12099 break;
12100 if ((prec*10) / 10 != prec) {
12101 PyErr_SetString(PyExc_ValueError,
12102 "prec too big");
12103 goto onError;
12104 }
12105 prec = prec*10 + (c - '0');
12106 }
12107 }
12108 } /* prec */
12109 if (fmtcnt >= 0) {
12110 if (c == 'h' || c == 'l' || c == 'L') {
12111 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012113 }
12114 }
12115 if (fmtcnt < 0) {
12116 PyErr_SetString(PyExc_ValueError,
12117 "incomplete format");
12118 goto onError;
12119 }
12120 if (c != '%') {
12121 v = getnextarg(args, arglen, &argidx);
12122 if (v == NULL)
12123 goto onError;
12124 }
12125 sign = 0;
12126 fill = ' ';
12127 switch (c) {
12128
12129 case '%':
12130 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012132 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 len = 1;
12135 break;
12136
12137 case 's':
12138 case 'r':
12139 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012140 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012141 temp = v;
12142 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012143 }
12144 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 if (c == 's')
12146 temp = PyObject_Str(v);
12147 else if (c == 'r')
12148 temp = PyObject_Repr(v);
12149 else
12150 temp = PyObject_ASCII(v);
12151 if (temp == NULL)
12152 goto onError;
12153 if (PyUnicode_Check(temp))
12154 /* nothing to do */;
12155 else {
12156 Py_DECREF(temp);
12157 PyErr_SetString(PyExc_TypeError,
12158 "%s argument has non-string str()");
12159 goto onError;
12160 }
12161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 if (PyUnicode_READY(temp) == -1) {
12163 Py_CLEAR(temp);
12164 goto onError;
12165 }
12166 pbuf = PyUnicode_DATA(temp);
12167 kind = PyUnicode_KIND(temp);
12168 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012169 if (prec >= 0 && len > prec)
12170 len = prec;
12171 break;
12172
12173 case 'i':
12174 case 'd':
12175 case 'u':
12176 case 'o':
12177 case 'x':
12178 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012179 isnumok = 0;
12180 if (PyNumber_Check(v)) {
12181 PyObject *iobj=NULL;
12182
12183 if (PyLong_Check(v)) {
12184 iobj = v;
12185 Py_INCREF(iobj);
12186 }
12187 else {
12188 iobj = PyNumber_Long(v);
12189 }
12190 if (iobj!=NULL) {
12191 if (PyLong_Check(iobj)) {
12192 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012193 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012194 Py_DECREF(iobj);
12195 if (!temp)
12196 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 if (PyUnicode_READY(temp) == -1) {
12198 Py_CLEAR(temp);
12199 goto onError;
12200 }
12201 pbuf = PyUnicode_DATA(temp);
12202 kind = PyUnicode_KIND(temp);
12203 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012204 sign = 1;
12205 }
12206 else {
12207 Py_DECREF(iobj);
12208 }
12209 }
12210 }
12211 if (!isnumok) {
12212 PyErr_Format(PyExc_TypeError,
12213 "%%%c format: a number is required, "
12214 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12215 goto onError;
12216 }
12217 if (flags & F_ZERO)
12218 fill = '0';
12219 break;
12220
12221 case 'e':
12222 case 'E':
12223 case 'f':
12224 case 'F':
12225 case 'g':
12226 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012227 temp = formatfloat(v, flags, prec, c);
12228 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012229 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 if (PyUnicode_READY(temp) == -1) {
12231 Py_CLEAR(temp);
12232 goto onError;
12233 }
12234 pbuf = PyUnicode_DATA(temp);
12235 kind = PyUnicode_KIND(temp);
12236 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012237 sign = 1;
12238 if (flags & F_ZERO)
12239 fill = '0';
12240 break;
12241
12242 case 'c':
12243 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012245 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 if (len < 0)
12247 goto onError;
12248 break;
12249
12250 default:
12251 PyErr_Format(PyExc_ValueError,
12252 "unsupported format character '%c' (0x%x) "
12253 "at index %zd",
12254 (31<=c && c<=126) ? (char)c : '?',
12255 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012257 goto onError;
12258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 /* pbuf is initialized here. */
12260 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012261 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12263 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12264 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012265 len--;
12266 }
12267 else if (flags & F_SIGN)
12268 sign = '+';
12269 else if (flags & F_BLANK)
12270 sign = ' ';
12271 else
12272 sign = 0;
12273 }
12274 if (width < len)
12275 width = len;
12276 if (rescnt - (sign != 0) < width) {
12277 reslen -= rescnt;
12278 rescnt = width + fmtcnt + 100;
12279 reslen += rescnt;
12280 if (reslen < 0) {
12281 Py_XDECREF(temp);
12282 PyErr_NoMemory();
12283 goto onError;
12284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12286 if (res0 == 0) {
12287 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 Py_XDECREF(temp);
12289 goto onError;
12290 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012292 }
12293 if (sign) {
12294 if (fill != ' ')
12295 *res++ = sign;
12296 rescnt--;
12297 if (width > len)
12298 width--;
12299 }
12300 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12302 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12305 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012306 }
12307 rescnt -= 2;
12308 width -= 2;
12309 if (width < 0)
12310 width = 0;
12311 len -= 2;
12312 }
12313 if (width > len && !(flags & F_LJUST)) {
12314 do {
12315 --rescnt;
12316 *res++ = fill;
12317 } while (--width > len);
12318 }
12319 if (fill == ' ') {
12320 if (sign)
12321 *res++ = sign;
12322 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12324 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12325 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12326 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012327 }
12328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 /* Copy all characters, preserving len */
12330 len1 = len;
12331 while (len1--) {
12332 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12333 rescnt--;
12334 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 while (--width >= len) {
12336 --rescnt;
12337 *res++ = ' ';
12338 }
12339 if (dict && (argidx < arglen) && c != '%') {
12340 PyErr_SetString(PyExc_TypeError,
12341 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012342 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 goto onError;
12344 }
12345 Py_XDECREF(temp);
12346 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347 } /* until end */
12348 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012349 PyErr_SetString(PyExc_TypeError,
12350 "not all arguments converted during string formatting");
12351 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352 }
12353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354
12355 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12356 if (*res > max)
12357 max = *res;
12358 result = PyUnicode_New(reslen - rescnt, max);
12359 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012360 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 kind = PyUnicode_KIND(result);
12362 for (res = res0; res < res0+reslen-rescnt; res++)
12363 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12364 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012366 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012367 }
12368 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369 return (PyObject *)result;
12370
Benjamin Peterson29060642009-01-31 22:14:21 +000012371 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373 Py_DECREF(uformat);
12374 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376 }
12377 return NULL;
12378}
12379
Jeremy Hylton938ace62002-07-17 16:30:39 +000012380static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012381unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12382
Tim Peters6d6c1a32001-08-02 04:15:00 +000012383static PyObject *
12384unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12385{
Benjamin Peterson29060642009-01-31 22:14:21 +000012386 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012387 static char *kwlist[] = {"object", "encoding", "errors", 0};
12388 char *encoding = NULL;
12389 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012390
Benjamin Peterson14339b62009-01-31 16:36:08 +000012391 if (type != &PyUnicode_Type)
12392 return unicode_subtype_new(type, args, kwds);
12393 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012395 return NULL;
12396 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012398 if (encoding == NULL && errors == NULL)
12399 return PyObject_Str(x);
12400 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012401 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012402}
12403
Guido van Rossume023fe02001-08-30 03:12:59 +000012404static PyObject *
12405unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12406{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012407 PyUnicodeObject *tmp, *pnew;
12408 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012410
Benjamin Peterson14339b62009-01-31 16:36:08 +000012411 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12412 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12413 if (tmp == NULL)
12414 return NULL;
12415 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12417 // it seems kind of strange that tp_alloc gets passed the size
12418 // of the unicode string because there will follow another
12419 // malloc.
12420 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12421 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012422 if (pnew == NULL) {
12423 Py_DECREF(tmp);
12424 return NULL;
12425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12427 if (_PyUnicode_WSTR(pnew) == NULL) {
12428 err = PyErr_NoMemory();
12429 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012430 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12432 _PyUnicode_WSTR_LENGTH(pnew) = n;
12433 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12434 _PyUnicode_STATE(pnew).interned = 0;
12435 _PyUnicode_STATE(pnew).kind = 0;
12436 _PyUnicode_STATE(pnew).compact = 0;
12437 _PyUnicode_STATE(pnew).ready = 0;
12438 _PyUnicode_STATE(pnew).ascii = 0;
12439 pnew->data.any = NULL;
12440 _PyUnicode_LENGTH(pnew) = 0;
12441 pnew->_base.utf8 = NULL;
12442 pnew->_base.utf8_length = 0;
12443
12444 if (PyUnicode_READY(pnew) == -1) {
12445 PyObject_FREE(_PyUnicode_WSTR(pnew));
12446 goto onError;
12447 }
12448
Benjamin Peterson14339b62009-01-31 16:36:08 +000012449 Py_DECREF(tmp);
12450 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451
12452 onError:
12453 _Py_ForgetReference((PyObject *)pnew);
12454 PyObject_Del(pnew);
12455 Py_DECREF(tmp);
12456 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012457}
12458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012459PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012461\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012462Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012463encoding defaults to the current default string encoding.\n\
12464errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012465
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012466static PyObject *unicode_iter(PyObject *seq);
12467
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012469 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012470 "str", /* tp_name */
12471 sizeof(PyUnicodeObject), /* tp_size */
12472 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012474 (destructor)unicode_dealloc, /* tp_dealloc */
12475 0, /* tp_print */
12476 0, /* tp_getattr */
12477 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012478 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012479 unicode_repr, /* tp_repr */
12480 &unicode_as_number, /* tp_as_number */
12481 &unicode_as_sequence, /* tp_as_sequence */
12482 &unicode_as_mapping, /* tp_as_mapping */
12483 (hashfunc) unicode_hash, /* tp_hash*/
12484 0, /* tp_call*/
12485 (reprfunc) unicode_str, /* tp_str */
12486 PyObject_GenericGetAttr, /* tp_getattro */
12487 0, /* tp_setattro */
12488 0, /* tp_as_buffer */
12489 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012490 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012491 unicode_doc, /* tp_doc */
12492 0, /* tp_traverse */
12493 0, /* tp_clear */
12494 PyUnicode_RichCompare, /* tp_richcompare */
12495 0, /* tp_weaklistoffset */
12496 unicode_iter, /* tp_iter */
12497 0, /* tp_iternext */
12498 unicode_methods, /* tp_methods */
12499 0, /* tp_members */
12500 0, /* tp_getset */
12501 &PyBaseObject_Type, /* tp_base */
12502 0, /* tp_dict */
12503 0, /* tp_descr_get */
12504 0, /* tp_descr_set */
12505 0, /* tp_dictoffset */
12506 0, /* tp_init */
12507 0, /* tp_alloc */
12508 unicode_new, /* tp_new */
12509 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510};
12511
12512/* Initialize the Unicode implementation */
12513
Thomas Wouters78890102000-07-22 19:25:51 +000012514void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012516 int i;
12517
Thomas Wouters477c8d52006-05-27 19:21:47 +000012518 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012520 0x000A, /* LINE FEED */
12521 0x000D, /* CARRIAGE RETURN */
12522 0x001C, /* FILE SEPARATOR */
12523 0x001D, /* GROUP SEPARATOR */
12524 0x001E, /* RECORD SEPARATOR */
12525 0x0085, /* NEXT LINE */
12526 0x2028, /* LINE SEPARATOR */
12527 0x2029, /* PARAGRAPH SEPARATOR */
12528 };
12529
Fred Drakee4315f52000-05-09 19:53:39 +000012530 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012532 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012534
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012535 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012536 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012537 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012538 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012539
12540 /* initialize the linebreak bloom filter */
12541 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012543 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012544
12545 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546}
12547
12548/* Finalize the Unicode implementation */
12549
/* Always returns 0; the function performs no other work. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
12555
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556void
Thomas Wouters78890102000-07-22 19:25:51 +000012557_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012559 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012561 Py_XDECREF(unicode_empty);
12562 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012563
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012564 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012565 if (unicode_latin1[i]) {
12566 Py_DECREF(unicode_latin1[i]);
12567 unicode_latin1[i] = NULL;
12568 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012569 }
Christian Heimesa156e092008-02-16 07:38:31 +000012570 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012572
Walter Dörwald16807132007-05-25 13:52:07 +000012573void
12574PyUnicode_InternInPlace(PyObject **p)
12575{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012576 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12577 PyObject *t;
12578 if (s == NULL || !PyUnicode_Check(s))
12579 Py_FatalError(
12580 "PyUnicode_InternInPlace: unicode strings only please!");
12581 /* If it's a subclass, we don't really know what putting
12582 it in the interned dict might do. */
12583 if (!PyUnicode_CheckExact(s))
12584 return;
12585 if (PyUnicode_CHECK_INTERNED(s))
12586 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 if (PyUnicode_READY(s) == -1) {
12588 assert(0 && "ready fail in intern...");
12589 return;
12590 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012591 if (interned == NULL) {
12592 interned = PyDict_New();
12593 if (interned == NULL) {
12594 PyErr_Clear(); /* Don't leave an exception */
12595 return;
12596 }
12597 }
12598 /* It might be that the GetItem call fails even
12599 though the key is present in the dictionary,
12600 namely when this happens during a stack overflow. */
12601 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012602 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012603 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012604
Benjamin Peterson29060642009-01-31 22:14:21 +000012605 if (t) {
12606 Py_INCREF(t);
12607 Py_DECREF(*p);
12608 *p = t;
12609 return;
12610 }
Walter Dörwald16807132007-05-25 13:52:07 +000012611
Benjamin Peterson14339b62009-01-31 16:36:08 +000012612 PyThreadState_GET()->recursion_critical = 1;
12613 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12614 PyErr_Clear();
12615 PyThreadState_GET()->recursion_critical = 0;
12616 return;
12617 }
12618 PyThreadState_GET()->recursion_critical = 0;
12619 /* The two references in interned are not counted by refcnt.
12620 The deallocator will take care of this */
12621 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012623}
12624
12625void
12626PyUnicode_InternImmortal(PyObject **p)
12627{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12629
Benjamin Peterson14339b62009-01-31 16:36:08 +000012630 PyUnicode_InternInPlace(p);
12631 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012633 Py_INCREF(*p);
12634 }
Walter Dörwald16807132007-05-25 13:52:07 +000012635}
12636
12637PyObject *
12638PyUnicode_InternFromString(const char *cp)
12639{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012640 PyObject *s = PyUnicode_FromString(cp);
12641 if (s == NULL)
12642 return NULL;
12643 PyUnicode_InternInPlace(&s);
12644 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012645}
12646
Alexander Belopolsky40018472011-02-26 01:02:56 +000012647void
12648_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012649{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012650 PyObject *keys;
12651 PyUnicodeObject *s;
12652 Py_ssize_t i, n;
12653 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012654
Benjamin Peterson14339b62009-01-31 16:36:08 +000012655 if (interned == NULL || !PyDict_Check(interned))
12656 return;
12657 keys = PyDict_Keys(interned);
12658 if (keys == NULL || !PyList_Check(keys)) {
12659 PyErr_Clear();
12660 return;
12661 }
Walter Dörwald16807132007-05-25 13:52:07 +000012662
Benjamin Peterson14339b62009-01-31 16:36:08 +000012663 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12664 detector, interned unicode strings are not forcibly deallocated;
12665 rather, we give them their stolen references back, and then clear
12666 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012667
Benjamin Peterson14339b62009-01-31 16:36:08 +000012668 n = PyList_GET_SIZE(keys);
12669 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012670 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012671 for (i = 0; i < n; i++) {
12672 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 if (PyUnicode_READY(s) == -1)
12674 fprintf(stderr, "could not ready string\n");
12675 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012676 case SSTATE_NOT_INTERNED:
12677 /* XXX Shouldn't happen */
12678 break;
12679 case SSTATE_INTERNED_IMMORTAL:
12680 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012682 break;
12683 case SSTATE_INTERNED_MORTAL:
12684 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012686 break;
12687 default:
12688 Py_FatalError("Inconsistent interned string state.");
12689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012691 }
12692 fprintf(stderr, "total size of all interned strings: "
12693 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12694 "mortal/immortal\n", mortal_size, immortal_size);
12695 Py_DECREF(keys);
12696 PyDict_Clear(interned);
12697 Py_DECREF(interned);
12698 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012699}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012700
12701
12702/********************* Unicode Iterator **************************/
12703
12704typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012705 PyObject_HEAD
12706 Py_ssize_t it_index;
12707 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012708} unicodeiterobject;
12709
12710static void
12711unicodeiter_dealloc(unicodeiterobject *it)
12712{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012713 _PyObject_GC_UNTRACK(it);
12714 Py_XDECREF(it->it_seq);
12715 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012716}
12717
12718static int
12719unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12720{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012721 Py_VISIT(it->it_seq);
12722 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012723}
12724
12725static PyObject *
12726unicodeiter_next(unicodeiterobject *it)
12727{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012728 PyUnicodeObject *seq;
12729 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012730
Benjamin Peterson14339b62009-01-31 16:36:08 +000012731 assert(it != NULL);
12732 seq = it->it_seq;
12733 if (seq == NULL)
12734 return NULL;
12735 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12738 int kind = PyUnicode_KIND(seq);
12739 void *data = PyUnicode_DATA(seq);
12740 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12741 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012742 if (item != NULL)
12743 ++it->it_index;
12744 return item;
12745 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012746
Benjamin Peterson14339b62009-01-31 16:36:08 +000012747 Py_DECREF(seq);
12748 it->it_seq = NULL;
12749 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012750}
12751
12752static PyObject *
12753unicodeiter_len(unicodeiterobject *it)
12754{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012755 Py_ssize_t len = 0;
12756 if (it->it_seq)
12757 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12758 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012759}
12760
12761PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12762
12763static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012764 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012765 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012766 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012767};
12768
12769PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012770 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12771 "str_iterator", /* tp_name */
12772 sizeof(unicodeiterobject), /* tp_basicsize */
12773 0, /* tp_itemsize */
12774 /* methods */
12775 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12776 0, /* tp_print */
12777 0, /* tp_getattr */
12778 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012779 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012780 0, /* tp_repr */
12781 0, /* tp_as_number */
12782 0, /* tp_as_sequence */
12783 0, /* tp_as_mapping */
12784 0, /* tp_hash */
12785 0, /* tp_call */
12786 0, /* tp_str */
12787 PyObject_GenericGetAttr, /* tp_getattro */
12788 0, /* tp_setattro */
12789 0, /* tp_as_buffer */
12790 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12791 0, /* tp_doc */
12792 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12793 0, /* tp_clear */
12794 0, /* tp_richcompare */
12795 0, /* tp_weaklistoffset */
12796 PyObject_SelfIter, /* tp_iter */
12797 (iternextfunc)unicodeiter_next, /* tp_iternext */
12798 unicodeiter_methods, /* tp_methods */
12799 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012800};
12801
12802static PyObject *
12803unicode_iter(PyObject *seq)
12804{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012805 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012806
Benjamin Peterson14339b62009-01-31 16:36:08 +000012807 if (!PyUnicode_Check(seq)) {
12808 PyErr_BadInternalCall();
12809 return NULL;
12810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 if (PyUnicode_READY(seq) == -1)
12812 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012813 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12814 if (it == NULL)
12815 return NULL;
12816 it->it_index = 0;
12817 Py_INCREF(seq);
12818 it->it_seq = (PyUnicodeObject *)seq;
12819 _PyObject_GC_TRACK(it);
12820 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012821}
12822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823#define UNIOP(x) Py_UNICODE_##x
12824#define UNIOP_t Py_UNICODE
12825#include "uniops.h"
12826#undef UNIOP
12827#undef UNIOP_t
12828#define UNIOP(x) Py_UCS4_##x
12829#define UNIOP_t Py_UCS4
12830#include "uniops.h"
12831#undef UNIOP
12832#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012833
Victor Stinner71133ff2010-09-01 23:43:53 +000012834Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012835PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012836{
12837 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12838 Py_UNICODE *copy;
12839 Py_ssize_t size;
12840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012841 if (!PyUnicode_Check(unicode)) {
12842 PyErr_BadArgument();
12843 return NULL;
12844 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012845 /* Ensure we won't overflow the size. */
12846 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12847 PyErr_NoMemory();
12848 return NULL;
12849 }
12850 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12851 size *= sizeof(Py_UNICODE);
12852 copy = PyMem_Malloc(size);
12853 if (copy == NULL) {
12854 PyErr_NoMemory();
12855 return NULL;
12856 }
12857 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12858 return copy;
12859}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012860
Georg Brandl66c221e2010-10-14 07:04:07 +000012861/* A _string module, to export formatter_parser and formatter_field_name_split
12862 to the string.Formatter class implemented in Python. */
12863
12864static PyMethodDef _string_methods[] = {
12865 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12866 METH_O, PyDoc_STR("split the argument as a field name")},
12867 {"formatter_parser", (PyCFunction) formatter_parser,
12868 METH_O, PyDoc_STR("parse the argument as a format string")},
12869 {NULL, NULL}
12870};
12871
12872static struct PyModuleDef _string_module = {
12873 PyModuleDef_HEAD_INIT,
12874 "_string",
12875 PyDoc_STR("string helper module"),
12876 0,
12877 _string_methods,
12878 NULL,
12879 NULL,
12880 NULL,
12881 NULL
12882};
12883
12884PyMODINIT_FUNC
12885PyInit__string(void)
12886{
12887 return PyModule_Create(&_string_module);
12888}
12889
12890
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012891#ifdef __cplusplus
12892}
12893#endif