blob: 7a7f1d2ca01b6b3708a833d924f2fc0a3ecbc396 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Limit for the Unicode object free list: the maximum number of
   objects kept on it. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN
   is defined, selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Generic helper macro that copies a run of characters while converting
   the element type.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (half-open range) and should be of type
   "from_type *".  to points to the destination buffer; it is cast to
   "to_type *" and must have room for (end - begin) elements.  Each
   element is converted with a plain cast, so no range check is done.

   Note: "end" is re-evaluated on every iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_++ = (to_type)*iter_++;                 \
        }                                               \
    } while (0)
107
/* Raw field accessors.  They only cast "op" and select a member, so they
   are usable as lvalues.  _PyUnicode_WSTR_LENGTH goes through
   PyCompactUnicodeObject; the others go through PyASCIIObject. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* Same as the kind/length fields above, but assert that "op" is a
   Unicode object first. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* The Unicode string has been modified: reset the cached hash to -1 */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200123
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well; the array is indexed by the character's code point. */
static PyUnicodeObject *unicode_latin1[256];
140
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point 0..127; an entry is 1 when the
   character is whitespace:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
171
/* Forward declarations of the encoder error helpers.  Both are static,
   so their definitions live in this translation unit. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       const Py_UNICODE *unicode, Py_ssize_t size,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000184
/* Same for linebreaks.

   Lookup table indexed by code point 0..127; an entry is 1 when the
   character is a line break:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
212
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300213/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
214 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000215Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000216PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000217{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000218#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000219 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000220#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000221 /* This is actually an illegal character, so it should
222 not be passed to unichr. */
223 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000224#endif
225}
226
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227/* --- Bloom Filters ----------------------------------------------------- */
228
229/* stuff to implement simple "bloom filters" for Unicode characters.
230 to keep things simple, we use a single bitmask, using the least 5
231 bits from each unicode characters as the bit index. */
232
233/* the linebreak mask is set up by Unicode_Init below */
234
Antoine Pitrouf068f942010-01-13 14:19:12 +0000235#if LONG_BIT >= 128
236#define BLOOM_WIDTH 128
237#elif LONG_BIT >= 64
238#define BLOOM_WIDTH 64
239#elif LONG_BIT >= 32
240#define BLOOM_WIDTH 32
241#else
242#error "LONG_BIT is smaller than 32"
243#endif
244
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245#define BLOOM_MASK unsigned long
246
247static BLOOM_MASK bloom_linebreak;
248
Antoine Pitrouf068f942010-01-13 14:19:12 +0000249#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
250#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000251
Benjamin Peterson29060642009-01-31 22:14:21 +0000252#define BLOOM_LINEBREAK(ch) \
253 ((ch) < 128U ? ascii_linebreak[(ch)] : \
254 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000255
Alexander Belopolsky40018472011-02-26 01:02:56 +0000256Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200257make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000258{
259 /* calculate simple bloom-style bitmask for a given unicode string */
260
Antoine Pitrouf068f942010-01-13 14:19:12 +0000261 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000262 Py_ssize_t i;
263
264 mask = 0;
265 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200266 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267
268 return mask;
269}
270
/* True when chr occurs in str: the bloom mask is checked first, and only
   if its bit is set is the string actually searched with
   PyUnicode_FindChar().  chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000274
/* --- Unicode Object ----------------------------------------------------- */

/* Forward declarations of static helpers used before their definition. */
static PyObject *
substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);

/* fixfct is a callback taking the string and returning a Py_UCS4. */
static PyObject *
fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
282
283Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
284 Py_ssize_t size, Py_UCS4 ch,
285 int direction)
286{
287 /* like wcschr, but doesn't stop at NULL characters */
288 Py_ssize_t i;
289 if (direction == 1) {
290 for(i = 0; i < size; i++)
291 if (PyUnicode_READ(kind, s, i) == ch)
292 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
293 }
294 else {
295 for(i = size-1; i >= 0; i--)
296 if (PyUnicode_READ(kind, s, i) == ch)
297 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
298 }
299 return NULL;
300}
301
Alexander Belopolsky40018472011-02-26 01:02:56 +0000302static int
303unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200304 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200308 /* Resizing is only supported for old unicode objects. */
309 assert(!PyUnicode_IS_COMPACT(unicode));
310 assert(_PyUnicode_WSTR(unicode) != NULL);
311
312 /* ... and only if they have not been readied yet, because
313 callees usually rely on the wstr representation when resizing. */
314 assert(unicode->data.any == NULL);
315
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000316 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200317 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000318 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320 /* Resizing shared object (unicode_empty or single character
321 objects) in-place is not allowed. Use PyUnicode_Resize()
322 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000323
Benjamin Peterson14339b62009-01-31 16:36:08 +0000324 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200325 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
326 _PyUnicode_WSTR(unicode)[0] < 256U &&
327 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000328 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000329 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 return -1;
331 }
332
Thomas Wouters477c8d52006-05-27 19:21:47 +0000333 /* We allocate one more byte to make sure the string is Ux0000 terminated.
334 The overallocation is also used by fastsearch, which assumes that it's
335 safe to look at str[length] (without making any assumptions about what
336 it contains). */
337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200338 oldstr = _PyUnicode_WSTR(unicode);
339 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
340 sizeof(Py_UNICODE) * (length + 1));
341 if (!_PyUnicode_WSTR(unicode)) {
342 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 PyErr_NoMemory();
344 return -1;
345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 _PyUnicode_WSTR(unicode)[length] = 0;
347 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350 if (unicode->data.any != NULL) {
351 PyObject_FREE(unicode->data.any);
352 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
353 PyObject_FREE(unicode->_base.utf8);
354 }
355 unicode->_base.utf8 = NULL;
356 unicode->_base.utf8_length = 0;
357 unicode->data.any = NULL;
358 _PyUnicode_LENGTH(unicode) = 0;
359 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
360 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200362 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000363
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364 return 0;
365}
366
367/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000368 Ux0000 terminated; some code (e.g. new_identifier)
369 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370
371 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000372 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373
374*/
375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200376#ifdef Py_DEBUG
377int unicode_old_new_calls = 0;
378#endif
379
Alexander Belopolsky40018472011-02-26 01:02:56 +0000380static PyUnicodeObject *
381_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382{
383 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385
Thomas Wouters477c8d52006-05-27 19:21:47 +0000386 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387 if (length == 0 && unicode_empty != NULL) {
388 Py_INCREF(unicode_empty);
389 return unicode_empty;
390 }
391
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000392 /* Ensure we won't overflow the size. */
393 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
394 return (PyUnicodeObject *)PyErr_NoMemory();
395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200396 if (length < 0) {
397 PyErr_SetString(PyExc_SystemError,
398 "Negative size passed to _PyUnicode_New");
399 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400 }
401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200402#ifdef Py_DEBUG
403 ++unicode_old_new_calls;
404#endif
405
406 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
407 if (unicode == NULL)
408 return NULL;
409 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
410 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
411 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000412 PyErr_NoMemory();
413 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000414 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200415
Jeremy Hyltond8082792003-09-16 19:41:39 +0000416 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000417 * the caller fails before initializing str -- unicode_resize()
418 * reads str[0], and the Keep-Alive optimization can keep memory
419 * allocated for str alive across a call to unicode_dealloc(unicode).
420 * We don't want unicode_resize to read uninitialized memory in
421 * that case.
422 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200423 _PyUnicode_WSTR(unicode)[0] = 0;
424 _PyUnicode_WSTR(unicode)[length] = 0;
425 _PyUnicode_WSTR_LENGTH(unicode) = length;
426 _PyUnicode_HASH(unicode) = -1;
427 _PyUnicode_STATE(unicode).interned = 0;
428 _PyUnicode_STATE(unicode).kind = 0;
429 _PyUnicode_STATE(unicode).compact = 0;
430 _PyUnicode_STATE(unicode).ready = 0;
431 _PyUnicode_STATE(unicode).ascii = 0;
432 unicode->data.any = NULL;
433 _PyUnicode_LENGTH(unicode) = 0;
434 unicode->_base.utf8 = NULL;
435 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000436 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000437
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000439 /* XXX UNREF/NEWREF interface should be more symmetrical */
440 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000441 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000442 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000443 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444}
445
#ifdef Py_DEBUG
/* Debug-build counter incremented by PyUnicode_New(). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Wraps the _PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return _PyUnicode_UTF8(unicode);
}

/* Wraps the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
/* Prints the object address, its compact/compact-ASCII flags and the
   addresses just past the PyASCIIObject and PyCompactUnicodeObject
   headers to stdout, then returns PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
467
468PyObject *
469PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
470{
471 PyObject *obj;
472 PyCompactUnicodeObject *unicode;
473 void *data;
474 int kind_state;
475 int is_sharing = 0, is_ascii = 0;
476 Py_ssize_t char_size;
477 Py_ssize_t struct_size;
478
479 /* Optimization for empty strings */
480 if (size == 0 && unicode_empty != NULL) {
481 Py_INCREF(unicode_empty);
482 return (PyObject *)unicode_empty;
483 }
484
485#ifdef Py_DEBUG
486 ++unicode_new_new_calls;
487#endif
488
489 struct_size = sizeof(PyCompactUnicodeObject);
490 if (maxchar < 128) {
491 kind_state = PyUnicode_1BYTE_KIND;
492 char_size = 1;
493 is_ascii = 1;
494 struct_size = sizeof(PyASCIIObject);
495 }
496 else if (maxchar < 256) {
497 kind_state = PyUnicode_1BYTE_KIND;
498 char_size = 1;
499 }
500 else if (maxchar < 65536) {
501 kind_state = PyUnicode_2BYTE_KIND;
502 char_size = 2;
503 if (sizeof(wchar_t) == 2)
504 is_sharing = 1;
505 }
506 else {
507 kind_state = PyUnicode_4BYTE_KIND;
508 char_size = 4;
509 if (sizeof(wchar_t) == 4)
510 is_sharing = 1;
511 }
512
513 /* Ensure we won't overflow the size. */
514 if (size < 0) {
515 PyErr_SetString(PyExc_SystemError,
516 "Negative size passed to PyUnicode_New");
517 return NULL;
518 }
519 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
520 return PyErr_NoMemory();
521
522 /* Duplicated allocation code from _PyObject_New() instead of a call to
523 * PyObject_New() so we are able to allocate space for the object and
524 * it's data buffer.
525 */
526 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
527 if (obj == NULL)
528 return PyErr_NoMemory();
529 obj = PyObject_INIT(obj, &PyUnicode_Type);
530 if (obj == NULL)
531 return NULL;
532
533 unicode = (PyCompactUnicodeObject *)obj;
534 if (is_ascii)
535 data = ((PyASCIIObject*)obj) + 1;
536 else
537 data = unicode + 1;
538 _PyUnicode_LENGTH(unicode) = size;
539 _PyUnicode_HASH(unicode) = -1;
540 _PyUnicode_STATE(unicode).interned = 0;
541 _PyUnicode_STATE(unicode).kind = kind_state;
542 _PyUnicode_STATE(unicode).compact = 1;
543 _PyUnicode_STATE(unicode).ready = 1;
544 _PyUnicode_STATE(unicode).ascii = is_ascii;
545 if (is_ascii) {
546 ((char*)data)[size] = 0;
547 _PyUnicode_WSTR(unicode) = NULL;
548 }
549 else if (kind_state == PyUnicode_1BYTE_KIND) {
550 ((char*)data)[size] = 0;
551 _PyUnicode_WSTR(unicode) = NULL;
552 _PyUnicode_WSTR_LENGTH(unicode) = 0;
553 unicode->utf8_length = 0;
554 unicode->utf8 = NULL;
555 }
556 else {
557 unicode->utf8 = NULL;
558 if (kind_state == PyUnicode_2BYTE_KIND)
559 ((Py_UCS2*)data)[size] = 0;
560 else /* kind_state == PyUnicode_4BYTE_KIND */
561 ((Py_UCS4*)data)[size] = 0;
562 if (is_sharing) {
563 _PyUnicode_WSTR_LENGTH(unicode) = size;
564 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
565 }
566 else {
567 _PyUnicode_WSTR_LENGTH(unicode) = 0;
568 _PyUnicode_WSTR(unicode) = NULL;
569 }
570 }
571 return obj;
572}
573
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   Valid surrogate pairs are combined into one code point; any other code
   unit (including a lone surrogate) is copied as is.  The other
   conversions are implemented as macros for efficiency.

   [begin, end) is the source range; the result is written to the 4-byte
   data of `unicode`, whose length must match exactly the number of code
   points produced (asserted).  This function assumes that unicode can
   hold one more code point than wstr characters for a terminating null
   character.  Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* high surrogate immediately followed by a low surrogate? */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
612
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200613Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200614PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
615 PyObject *from, Py_ssize_t from_start,
616 Py_ssize_t how_many)
617{
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200618 unsigned int from_kind;
619 unsigned int to_kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200620
621 assert(PyUnicode_Check(from));
622 assert(PyUnicode_Check(to));
623
624 if (PyUnicode_READY(from))
625 return -1;
626 if (PyUnicode_READY(to))
627 return -1;
628
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200629 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200630 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
631 PyErr_Format(PyExc_ValueError,
632 "Cannot write %zi characters at %zi "
633 "in a string of %zi characters",
634 how_many, to_start, PyUnicode_GET_LENGTH(to));
635 return -1;
636 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200637 if (how_many == 0)
638 return 0;
639
640 if (Py_REFCNT(to) != 1) {
641 PyErr_SetString(PyExc_ValueError,
642 "Cannot modify a string having more than 1 reference");
643 return -1;
644 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200645 _PyUnicode_DIRTY(unicode);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647 from_kind = PyUnicode_KIND(from);
648 to_kind = PyUnicode_KIND(to);
649
650 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200651 /* fast path */
652 Py_MEMCPY((char*)PyUnicode_DATA(to)
653 + PyUnicode_KIND_SIZE(to_kind, to_start),
654 (char*)PyUnicode_DATA(from)
655 + PyUnicode_KIND_SIZE(from_kind, from_start),
656 PyUnicode_KIND_SIZE(to_kind, how_many));
657 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200658 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200659
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200660 if (from_kind > to_kind) {
661 /* slow path to check for character overflow */
662 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
663 void *from_data = PyUnicode_DATA(from);
664 void *to_data = PyUnicode_DATA(to);
665 Py_UCS4 ch, maxchar;
666 Py_ssize_t i;
667 int overflow;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200668
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200669 maxchar = 0;
Victor Stinner73f01c62011-09-28 22:28:04 +0200670 overflow = 0;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200671 for (i=0; i < how_many; i++) {
672 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
673 if (ch > maxchar) {
674 maxchar = ch;
675 if (maxchar > to_maxchar) {
676 overflow = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677 break;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200679 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200680 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
681 }
682 if (!overflow)
683 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200684 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200685 else if (from_kind == PyUnicode_1BYTE_KIND && to_kind == PyUnicode_2BYTE_KIND)
686 {
687 _PyUnicode_CONVERT_BYTES(
688 Py_UCS1, Py_UCS2,
689 PyUnicode_1BYTE_DATA(from) + from_start,
690 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
691 PyUnicode_2BYTE_DATA(to) + to_start
692 );
693 return how_many;
694 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200695 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200696 && to_kind == PyUnicode_4BYTE_KIND)
697 {
698 _PyUnicode_CONVERT_BYTES(
699 Py_UCS1, Py_UCS4,
700 PyUnicode_1BYTE_DATA(from) + from_start,
701 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
702 PyUnicode_4BYTE_DATA(to) + to_start
703 );
704 return how_many;
705 }
706 else if (from_kind == PyUnicode_2BYTE_KIND
707 && to_kind == PyUnicode_4BYTE_KIND)
708 {
709 _PyUnicode_CONVERT_BYTES(
710 Py_UCS2, Py_UCS4,
711 PyUnicode_2BYTE_DATA(from) + from_start,
712 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
713 PyUnicode_4BYTE_DATA(to) + to_start
714 );
715 return how_many;
716 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717 PyErr_Format(PyExc_ValueError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200718 "Cannot copy UCS%u characters "
719 "into a string of UCS%u characters",
Victor Stinner157f83f2011-09-28 21:41:31 +0200720 1 << (from_kind - 1),
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200721 1 << (to_kind -1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200722 return -1;
723}
724
Victor Stinner17222162011-09-28 22:15:37 +0200725/* Find the maximum code point and count the number of surrogate pairs so a
726 correct string length can be computed before converting a string to UCS4.
727 This function counts single surrogates as a character and not as a pair.
728
729 Return 0 on success, or -1 on error. */
730static int
731find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
732 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200733{
734 const wchar_t *iter;
735
736 if (num_surrogates == NULL || maxchar == NULL) {
737 PyErr_SetString(PyExc_SystemError,
738 "unexpected NULL arguments to "
739 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
740 return -1;
741 }
742
743 *num_surrogates = 0;
744 *maxchar = 0;
745
746 for (iter = begin; iter < end; ) {
747 if (*iter > *maxchar)
748 *maxchar = *iter;
749#if SIZEOF_WCHAR_T == 2
750 if (*iter >= 0xD800 && *iter <= 0xDBFF
751 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
752 {
753 Py_UCS4 surrogate_val;
754 surrogate_val = (((iter[0] & 0x3FF)<<10)
755 | (iter[1] & 0x3FF)) + 0x10000;
756 ++(*num_surrogates);
757 if (surrogate_val > *maxchar)
758 *maxchar = surrogate_val;
759 iter += 2;
760 }
761 else
762 iter++;
763#else
764 iter++;
765#endif
766 }
767 return 0;
768}
769
770#ifdef Py_DEBUG
771int unicode_ready_calls = 0;
772#endif
773
774int
775_PyUnicode_Ready(PyUnicodeObject *unicode)
776{
777 wchar_t *end;
778 Py_UCS4 maxchar = 0;
779 Py_ssize_t num_surrogates;
780#if SIZEOF_WCHAR_T == 2
781 Py_ssize_t length_wo_surrogates;
782#endif
783
784 assert(PyUnicode_Check(unicode));
785
786 if (unicode->data.any != NULL) {
787 assert(PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
788 return 0;
789 }
790
791 /* _PyUnicode_Ready() is only intented for old-style API usage where
792 * strings were created using _PyObject_New() and where no canonical
793 * representation (the str field) has been set yet aka strings
794 * which are not yet ready.
795 */
796 assert(_PyUnicode_WSTR(unicode) != NULL);
797 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
798 assert(!PyUnicode_IS_COMPACT(unicode));
799 assert(!PyUnicode_IS_READY(unicode));
800 /* Actually, it should neither be interned nor be anything else: */
801 assert(_PyUnicode_STATE(unicode).interned == 0);
802 assert(unicode->_base.utf8 == NULL);
803
804#ifdef Py_DEBUG
805 ++unicode_ready_calls;
806#endif
807
808 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200809 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810 &maxchar,
811 &num_surrogates) == -1) {
812 assert(0 && "PyUnicode_FindMaxCharAndNumSurrogatePairs failed");
813 return -1;
814 }
815
816 if (maxchar < 256) {
817 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
818 if (!unicode->data.any) {
819 PyErr_NoMemory();
820 return -1;
821 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200822 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200823 _PyUnicode_WSTR(unicode), end,
824 PyUnicode_1BYTE_DATA(unicode));
825 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
826 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
827 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
828 if (maxchar < 128) {
829 unicode->_base.utf8 = unicode->data.any;
830 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
831 }
832 else {
833 unicode->_base.utf8 = NULL;
834 unicode->_base.utf8_length = 0;
835 }
836 PyObject_FREE(_PyUnicode_WSTR(unicode));
837 _PyUnicode_WSTR(unicode) = NULL;
838 _PyUnicode_WSTR_LENGTH(unicode) = 0;
839 }
840 /* In this case we might have to convert down from 4-byte native
841 wchar_t to 2-byte unicode. */
842 else if (maxchar < 65536) {
843 assert(num_surrogates == 0 &&
844 "FindMaxCharAndNumSurrogatePairs() messed up");
845
Victor Stinner506f5922011-09-28 22:34:18 +0200846#if SIZEOF_WCHAR_T == 2
847 /* We can share representations and are done. */
848 unicode->data.any = _PyUnicode_WSTR(unicode);
849 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
850 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
851 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
852 unicode->_base.utf8 = NULL;
853 unicode->_base.utf8_length = 0;
854#else
855 /* sizeof(wchar_t) == 4 */
856 unicode->data.any = PyObject_MALLOC(
857 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
858 if (!unicode->data.any) {
859 PyErr_NoMemory();
860 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 }
Victor Stinner506f5922011-09-28 22:34:18 +0200862 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
863 _PyUnicode_WSTR(unicode), end,
864 PyUnicode_2BYTE_DATA(unicode));
865 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
866 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
867 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
868 unicode->_base.utf8 = NULL;
869 unicode->_base.utf8_length = 0;
870 PyObject_FREE(_PyUnicode_WSTR(unicode));
871 _PyUnicode_WSTR(unicode) = NULL;
872 _PyUnicode_WSTR_LENGTH(unicode) = 0;
873#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874 }
875 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
876 else {
877#if SIZEOF_WCHAR_T == 2
878 /* in case the native representation is 2-bytes, we need to allocate a
879 new normalized 4-byte version. */
880 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
881 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
882 if (!unicode->data.any) {
883 PyErr_NoMemory();
884 return -1;
885 }
886 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
887 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
888 unicode->_base.utf8 = NULL;
889 unicode->_base.utf8_length = 0;
890 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
891 unicode) < 0) {
892 assert(0 && "ConvertWideCharToUCS4 failed");
893 return -1;
894 }
895 PyObject_FREE(_PyUnicode_WSTR(unicode));
896 _PyUnicode_WSTR(unicode) = NULL;
897 _PyUnicode_WSTR_LENGTH(unicode) = 0;
898#else
899 assert(num_surrogates == 0);
900
901 unicode->data.any = _PyUnicode_WSTR(unicode);
902 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
903 unicode->_base.utf8 = NULL;
904 unicode->_base.utf8_length = 0;
905 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
906#endif
907 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
908 }
909 _PyUnicode_STATE(unicode).ready = 1;
910 return 0;
911}
912
Alexander Belopolsky40018472011-02-26 01:02:56 +0000913static void
914unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000915{
Walter Dörwald16807132007-05-25 13:52:07 +0000916 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000917 case SSTATE_NOT_INTERNED:
918 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000919
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 case SSTATE_INTERNED_MORTAL:
921 /* revive dead object temporarily for DelItem */
922 Py_REFCNT(unicode) = 3;
923 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
924 Py_FatalError(
925 "deletion of interned string failed");
926 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000927
Benjamin Peterson29060642009-01-31 22:14:21 +0000928 case SSTATE_INTERNED_IMMORTAL:
929 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000930
Benjamin Peterson29060642009-01-31 22:14:21 +0000931 default:
932 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000933 }
934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200935 if (_PyUnicode_WSTR(unicode) &&
936 (!PyUnicode_IS_READY(unicode) ||
937 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
938 PyObject_DEL(_PyUnicode_WSTR(unicode));
939 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
940 PyObject_DEL(unicode->_base.utf8);
941
942 if (PyUnicode_IS_COMPACT(unicode)) {
943 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944 }
945 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200946 if (unicode->data.any)
947 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000949 }
950}
951
Alexander Belopolsky40018472011-02-26 01:02:56 +0000952static int
953_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000954{
955 register PyUnicodeObject *v;
956
957 /* Argument checks */
958 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000959 PyErr_BadInternalCall();
960 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000961 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000962 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
964 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000965 PyErr_BadInternalCall();
966 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000967 }
968
969 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970 possible since these are being shared.
971 The same goes for new-representation unicode objects or objects which
972 have already been readied.
973 For these, we simply return a fresh copy with the same Unicode content.
974 */
975 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
976 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
977 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000978 PyUnicodeObject *w = _PyUnicode_New(length);
979 if (w == NULL)
980 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
982 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000983 Py_DECREF(*unicode);
984 *unicode = w;
985 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000986 }
987
988 /* Note that we don't have to modify *unicode for unshared Unicode
989 objects, since we can modify them in-place. */
990 return unicode_resize(v, length);
991}
992
Alexander Belopolsky40018472011-02-26 01:02:56 +0000993int
994PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000995{
996 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
997}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200999static PyObject*
1000get_latin1_char(unsigned char ch)
1001{
1002 PyUnicodeObject *unicode = unicode_latin1[ch];
1003 if (!unicode) {
1004 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1005 if (!unicode)
1006 return NULL;
1007 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1008 unicode_latin1[ch] = unicode;
1009 }
1010 Py_INCREF(unicode);
1011 return (PyObject *)unicode;
1012}
1013
Alexander Belopolsky40018472011-02-26 01:02:56 +00001014PyObject *
1015PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001016{
1017 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001018 Py_UCS4 maxchar = 0;
1019 Py_ssize_t num_surrogates;
1020
1021 if (u == NULL)
1022 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001024 /* If the Unicode data is known at construction time, we can apply
1025 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027 /* Optimization for empty strings */
1028 if (size == 0 && unicode_empty != NULL) {
1029 Py_INCREF(unicode_empty);
1030 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001031 }
Tim Petersced69f82003-09-16 20:30:58 +00001032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001033 /* Single character Unicode objects in the Latin-1 range are
1034 shared when using this constructor */
1035 if (size == 1 && *u < 256)
1036 return get_latin1_char((unsigned char)*u);
1037
1038 /* If not empty and not single character, copy the Unicode data
1039 into the new object */
Victor Stinner17222162011-09-28 22:15:37 +02001040 if (find_maxchar_surrogates(u, u + size, &maxchar,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 &num_surrogates) == -1)
1042 return NULL;
1043
1044 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1045 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046 if (!unicode)
1047 return NULL;
1048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 switch (PyUnicode_KIND(unicode)) {
1050 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001051 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1053 break;
1054 case PyUnicode_2BYTE_KIND:
1055#if Py_UNICODE_SIZE == 2
1056 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1057#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001058 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1060#endif
1061 break;
1062 case PyUnicode_4BYTE_KIND:
1063#if SIZEOF_WCHAR_T == 2
1064 /* This is the only case which has to process surrogates, thus
1065 a simple copy loop is not enough and we need a function. */
1066 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1067 Py_DECREF(unicode);
1068 return NULL;
1069 }
1070#else
1071 assert(num_surrogates == 0);
1072 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1073#endif
1074 break;
1075 default:
1076 assert(0 && "Impossible state");
1077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079 return (PyObject *)unicode;
1080}
1081
Alexander Belopolsky40018472011-02-26 01:02:56 +00001082PyObject *
1083PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001084{
1085 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001086
Benjamin Peterson14339b62009-01-31 16:36:08 +00001087 if (size < 0) {
1088 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001089 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001090 return NULL;
1091 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001092
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001093 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001094 some optimizations which share commonly used objects.
1095 Also, this means the input must be UTF-8, so fall back to the
1096 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001097 if (u != NULL) {
1098
Benjamin Peterson29060642009-01-31 22:14:21 +00001099 /* Optimization for empty strings */
1100 if (size == 0 && unicode_empty != NULL) {
1101 Py_INCREF(unicode_empty);
1102 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001103 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001104
1105 /* Single characters are shared when using this constructor.
1106 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 if (size == 1 && Py_CHARMASK(*u) < 128)
1108 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001109
1110 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001111 }
1112
Walter Dörwald55507312007-05-18 13:12:10 +00001113 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001114 if (!unicode)
1115 return NULL;
1116
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001117 return (PyObject *)unicode;
1118}
1119
Alexander Belopolsky40018472011-02-26 01:02:56 +00001120PyObject *
1121PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001122{
1123 size_t size = strlen(u);
1124 if (size > PY_SSIZE_T_MAX) {
1125 PyErr_SetString(PyExc_OverflowError, "input too long");
1126 return NULL;
1127 }
1128
1129 return PyUnicode_FromStringAndSize(u, size);
1130}
1131
Victor Stinnere57b1c02011-09-28 22:20:48 +02001132static PyObject*
1133_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001134{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135 PyObject *res;
1136 unsigned char max = 127;
1137 Py_ssize_t i;
1138 for (i = 0; i < size; i++) {
1139 if (u[i] & 0x80) {
1140 max = 255;
1141 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001142 }
1143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 res = PyUnicode_New(size, max);
1145 if (!res)
1146 return NULL;
1147 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1148 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001149}
1150
Victor Stinnere57b1c02011-09-28 22:20:48 +02001151static PyObject*
1152_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153{
1154 PyObject *res;
1155 Py_UCS2 max = 0;
1156 Py_ssize_t i;
1157 for (i = 0; i < size; i++)
1158 if (u[i] > max)
1159 max = u[i];
1160 res = PyUnicode_New(size, max);
1161 if (!res)
1162 return NULL;
1163 if (max >= 256)
1164 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1165 else
1166 for (i = 0; i < size; i++)
1167 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1168 return res;
1169}
1170
Victor Stinnere57b1c02011-09-28 22:20:48 +02001171static PyObject*
1172_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173{
1174 PyObject *res;
1175 Py_UCS4 max = 0;
1176 Py_ssize_t i;
1177 for (i = 0; i < size; i++)
1178 if (u[i] > max)
1179 max = u[i];
1180 res = PyUnicode_New(size, max);
1181 if (!res)
1182 return NULL;
1183 if (max >= 0x10000)
1184 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1185 else {
1186 int kind = PyUnicode_KIND(res);
1187 void *data = PyUnicode_DATA(res);
1188 for (i = 0; i < size; i++)
1189 PyUnicode_WRITE(kind, data, i, u[i]);
1190 }
1191 return res;
1192}
1193
1194PyObject*
1195PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1196{
1197 switch(kind) {
1198 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001199 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001201 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001203 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204 }
1205 assert(0);
1206 return NULL;
1207}
1208
1209
1210/* Widen Unicode objects to larger buffers.
1211 Return NULL if the string is too wide already. */
1212
1213void*
1214_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1215{
1216 Py_ssize_t i;
1217 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1218 void *d = PyUnicode_DATA(s);
1219 unsigned int skind = PyUnicode_KIND(s);
1220 if (PyUnicode_KIND(s) >= kind) {
1221 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1222 return NULL;
1223 }
1224 switch(kind) {
1225 case PyUnicode_2BYTE_KIND: {
1226 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1227 if (!result) {
1228 PyErr_NoMemory();
1229 return 0;
1230 }
1231 for (i = 0; i < len; i++)
1232 result[i] = ((Py_UCS1*)d)[i];
1233 return result;
1234 }
1235 case PyUnicode_4BYTE_KIND: {
1236 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1237 if (!result) {
1238 PyErr_NoMemory();
1239 return 0;
1240 }
1241 for (i = 0; i < len; i++)
1242 result[i] = PyUnicode_READ(skind, d, i);
1243 return result;
1244 }
1245 }
1246 Py_FatalError("invalid kind");
1247 return NULL;
1248}
1249
1250static Py_UCS4*
1251as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1252 int copy_null)
1253{
1254 int kind;
1255 void *data;
1256 Py_ssize_t len, targetlen;
1257 if (PyUnicode_READY(string) == -1)
1258 return NULL;
1259 kind = PyUnicode_KIND(string);
1260 data = PyUnicode_DATA(string);
1261 len = PyUnicode_GET_LENGTH(string);
1262 targetlen = len;
1263 if (copy_null)
1264 targetlen++;
1265 if (!target) {
1266 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1267 PyErr_NoMemory();
1268 return NULL;
1269 }
1270 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1271 if (!target) {
1272 PyErr_NoMemory();
1273 return NULL;
1274 }
1275 }
1276 else {
1277 if (targetsize < targetlen) {
1278 PyErr_Format(PyExc_SystemError,
1279 "string is longer than the buffer");
1280 if (copy_null && 0 < targetsize)
1281 target[0] = 0;
1282 return NULL;
1283 }
1284 }
1285 if (kind != PyUnicode_4BYTE_KIND) {
1286 Py_ssize_t i;
1287 for (i = 0; i < len; i++)
1288 target[i] = PyUnicode_READ(kind, data, i);
1289 }
1290 else
1291 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1292 if (copy_null)
1293 target[len] = 0;
1294 return target;
1295}
1296
1297Py_UCS4*
1298PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1299 int copy_null)
1300{
1301 if (target == NULL || targetsize < 1) {
1302 PyErr_BadInternalCall();
1303 return NULL;
1304 }
1305 return as_ucs4(string, target, targetsize, copy_null);
1306}
1307
1308Py_UCS4*
1309PyUnicode_AsUCS4Copy(PyObject *string)
1310{
1311 return as_ucs4(string, NULL, 0, 1);
1312}
1313
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wchar_t buffer `w` of `size` units.
   A size of -1 means "use wcslen(w)".  A NULL buffer is only accepted
   together with size 0, in which case an empty string is created; any other
   NULL call is a bad internal call.  Return a new reference or NULL. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001334
Walter Dörwald346737f2007-05-31 10:44:43 +00001335static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001336makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1337 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001338{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001339 *fmt++ = '%';
1340 if (width) {
1341 if (zeropad)
1342 *fmt++ = '0';
1343 fmt += sprintf(fmt, "%d", width);
1344 }
1345 if (precision)
1346 fmt += sprintf(fmt, ".%d", precision);
1347 if (longflag)
1348 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001349 else if (longlongflag) {
1350 /* longlongflag should only ever be nonzero on machines with
1351 HAVE_LONG_LONG defined */
1352#ifdef HAVE_LONG_LONG
1353 char *f = PY_FORMAT_LONG_LONG;
1354 while (*f)
1355 *fmt++ = *f++;
1356#else
1357 /* we shouldn't ever get here */
1358 assert(0);
1359 *fmt++ = 'l';
1360#endif
1361 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001362 else if (size_tflag) {
1363 char *f = PY_FORMAT_SIZE_T;
1364 while (*f)
1365 *fmt++ = *f++;
1366 }
1367 *fmt++ = c;
1368 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001369}
1370
/* Helper for PyUnicode_FromFormatV(): parse the width, precision and length
   modifier of one format specification.

   `f` points at the first character of the specification, which is skipped
   (the '%' at the visible call sites).  Each output pointer may be NULL, in
   which case that value is not reported.  The length flags are only set
   when the modifier is followed by 'd', 'u' or 'i'.

   Return a pointer to the character following the parsed part, normally
   the conversion character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld, %llu and the size_t flag (%zd, %zu, %zi). */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* Report only the values the caller asked for. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1435
/* Upper bound on the characters produced by %ld: 21 characters are enough
   for a 64-bit integer in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the characters produced by %lld: at most
   ceil(log10(256)*SIZEOF_LONG_LONG) digits plus 1 for the sign, where
   53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1443
Walter Dörwaldd2034312007-05-18 16:29:38 +00001444PyObject *
1445PyUnicode_FromFormatV(const char *format, va_list vargs)
1446{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001447 va_list count;
1448 Py_ssize_t callcount = 0;
1449 PyObject **callresults = NULL;
1450 PyObject **callresult = NULL;
1451 Py_ssize_t n = 0;
1452 int width = 0;
1453 int precision = 0;
1454 int zeropad;
1455 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001457 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001458 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1460 Py_UCS4 argmaxchar;
1461 Py_ssize_t numbersize = 0;
1462 char *numberresults = NULL;
1463 char *numberresult = NULL;
1464 Py_ssize_t i;
1465 int kind;
1466 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001467
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001468 Py_VA_COPY(count, vargs);
    /* step 1: count the number of %S/%R/%A/%s format specifications
     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
     * result in an array)
     * also estimate an upper bound for all the number formats in the string;
     * numbers will be formatted in step 3 and be kept in a '\0'-separated
     * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001476 for (f = format; *f; f++) {
1477 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001478 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001479 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1480 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1481 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1482 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001485#ifdef HAVE_LONG_LONG
1486 if (longlongflag) {
1487 if (width < MAX_LONG_LONG_CHARS)
1488 width = MAX_LONG_LONG_CHARS;
1489 }
1490 else
1491#endif
1492 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1493 including sign. Decimal takes the most space. This
1494 isn't enough for octal. If a width is specified we
1495 need more (which we allocate later). */
1496 if (width < MAX_LONG_CHARS)
1497 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001498
1499 /* account for the size + '\0' to separate numbers
1500 inside of the numberresults buffer */
1501 numbersize += (width + 1);
1502 }
1503 }
1504 else if ((unsigned char)*f > 127) {
1505 PyErr_Format(PyExc_ValueError,
1506 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1507 "string, got a non-ASCII byte: 0x%02x",
1508 (unsigned char)*f);
1509 return NULL;
1510 }
1511 }
1512 /* step 2: allocate memory for the results of
1513 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1514 if (callcount) {
1515 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1516 if (!callresults) {
1517 PyErr_NoMemory();
1518 return NULL;
1519 }
1520 callresult = callresults;
1521 }
1522 /* step 2.5: allocate memory for the results of formating numbers */
1523 if (numbersize) {
1524 numberresults = PyObject_Malloc(numbersize);
1525 if (!numberresults) {
1526 PyErr_NoMemory();
1527 goto fail;
1528 }
1529 numberresult = numberresults;
1530 }
1531
1532 /* step 3: format numbers and figure out how large a buffer we need */
1533 for (f = format; *f; f++) {
1534 if (*f == '%') {
1535 const char* p;
1536 int longflag;
1537 int longlongflag;
1538 int size_tflag;
1539 int numprinted;
1540
1541 p = f;
1542 zeropad = (f[1] == '0');
1543 f = parse_format_flags(f, &width, &precision,
1544 &longflag, &longlongflag, &size_tflag);
1545 switch (*f) {
1546 case 'c':
1547 {
1548 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001549 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 n++;
1551 break;
1552 }
1553 case '%':
1554 n++;
1555 break;
1556 case 'i':
1557 case 'd':
1558 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1559 width, precision, *f);
1560 if (longflag)
1561 numprinted = sprintf(numberresult, fmt,
1562 va_arg(count, long));
1563#ifdef HAVE_LONG_LONG
1564 else if (longlongflag)
1565 numprinted = sprintf(numberresult, fmt,
1566 va_arg(count, PY_LONG_LONG));
1567#endif
1568 else if (size_tflag)
1569 numprinted = sprintf(numberresult, fmt,
1570 va_arg(count, Py_ssize_t));
1571 else
1572 numprinted = sprintf(numberresult, fmt,
1573 va_arg(count, int));
1574 n += numprinted;
1575 /* advance by +1 to skip over the '\0' */
1576 numberresult += (numprinted + 1);
1577 assert(*(numberresult - 1) == '\0');
1578 assert(*(numberresult - 2) != '\0');
1579 assert(numprinted >= 0);
1580 assert(numberresult <= numberresults + numbersize);
1581 break;
1582 case 'u':
1583 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1584 width, precision, 'u');
1585 if (longflag)
1586 numprinted = sprintf(numberresult, fmt,
1587 va_arg(count, unsigned long));
1588#ifdef HAVE_LONG_LONG
1589 else if (longlongflag)
1590 numprinted = sprintf(numberresult, fmt,
1591 va_arg(count, unsigned PY_LONG_LONG));
1592#endif
1593 else if (size_tflag)
1594 numprinted = sprintf(numberresult, fmt,
1595 va_arg(count, size_t));
1596 else
1597 numprinted = sprintf(numberresult, fmt,
1598 va_arg(count, unsigned int));
1599 n += numprinted;
1600 numberresult += (numprinted + 1);
1601 assert(*(numberresult - 1) == '\0');
1602 assert(*(numberresult - 2) != '\0');
1603 assert(numprinted >= 0);
1604 assert(numberresult <= numberresults + numbersize);
1605 break;
1606 case 'x':
1607 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1608 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1609 n += numprinted;
1610 numberresult += (numprinted + 1);
1611 assert(*(numberresult - 1) == '\0');
1612 assert(*(numberresult - 2) != '\0');
1613 assert(numprinted >= 0);
1614 assert(numberresult <= numberresults + numbersize);
1615 break;
1616 case 'p':
1617 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1618 /* %p is ill-defined: ensure leading 0x. */
1619 if (numberresult[1] == 'X')
1620 numberresult[1] = 'x';
1621 else if (numberresult[1] != 'x') {
1622 memmove(numberresult + 2, numberresult,
1623 strlen(numberresult) + 1);
1624 numberresult[0] = '0';
1625 numberresult[1] = 'x';
1626 numprinted += 2;
1627 }
1628 n += numprinted;
1629 numberresult += (numprinted + 1);
1630 assert(*(numberresult - 1) == '\0');
1631 assert(*(numberresult - 2) != '\0');
1632 assert(numprinted >= 0);
1633 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001634 break;
1635 case 's':
1636 {
1637 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001638 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001639 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1640 if (!str)
1641 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 /* since PyUnicode_DecodeUTF8 returns already flexible
1643 unicode objects, there is no need to call ready on them */
1644 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001645 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001647 /* Remember the str and switch to the next slot */
1648 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001649 break;
1650 }
1651 case 'U':
1652 {
1653 PyObject *obj = va_arg(count, PyObject *);
1654 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 if (PyUnicode_READY(obj) == -1)
1656 goto fail;
1657 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001658 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001660 break;
1661 }
1662 case 'V':
1663 {
1664 PyObject *obj = va_arg(count, PyObject *);
1665 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001666 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001667 assert(obj || str);
1668 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001669 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 if (PyUnicode_READY(obj) == -1)
1671 goto fail;
1672 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001673 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001675 *callresult++ = NULL;
1676 }
1677 else {
1678 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1679 if (!str_obj)
1680 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001682 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001684 *callresult++ = str_obj;
1685 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001686 break;
1687 }
1688 case 'S':
1689 {
1690 PyObject *obj = va_arg(count, PyObject *);
1691 PyObject *str;
1692 assert(obj);
1693 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001695 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001697 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001699 /* Remember the str and switch to the next slot */
1700 *callresult++ = str;
1701 break;
1702 }
1703 case 'R':
1704 {
1705 PyObject *obj = va_arg(count, PyObject *);
1706 PyObject *repr;
1707 assert(obj);
1708 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001710 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001712 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001714 /* Remember the repr and switch to the next slot */
1715 *callresult++ = repr;
1716 break;
1717 }
1718 case 'A':
1719 {
1720 PyObject *obj = va_arg(count, PyObject *);
1721 PyObject *ascii;
1722 assert(obj);
1723 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001725 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001727 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001728 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001729 /* Remember the repr and switch to the next slot */
1730 *callresult++ = ascii;
1731 break;
1732 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001733 default:
1734 /* if we stumble upon an unknown
1735 formatting code, copy the rest of
1736 the format string to the output
1737 string. (we cannot just skip the
1738 code, since there's no way to know
1739 what's in the argument list) */
1740 n += strlen(p);
1741 goto expand;
1742 }
1743 } else
1744 n++;
1745 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001746 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001747 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001749 we don't have to resize the string.
1750 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001752 if (!string)
1753 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 kind = PyUnicode_KIND(string);
1755 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001756 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001760 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001761 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001762
1763 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1765 /* checking for == because the last argument could be a empty
1766 string, which causes i to point to end, the assert at the end of
1767 the loop */
1768 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001769
Benjamin Peterson14339b62009-01-31 16:36:08 +00001770 switch (*f) {
1771 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001772 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 const int ordinal = va_arg(vargs, int);
1774 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001775 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001776 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001777 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001778 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001779 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001780 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 case 'p':
1782 /* unused, since we already have the result */
1783 if (*f == 'p')
1784 (void) va_arg(vargs, void *);
1785 else
1786 (void) va_arg(vargs, int);
1787 /* extract the result from numberresults and append. */
1788 for (; *numberresult; ++i, ++numberresult)
1789 PyUnicode_WRITE(kind, data, i, *numberresult);
1790 /* skip over the separating '\0' */
1791 assert(*numberresult == '\0');
1792 numberresult++;
1793 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001794 break;
1795 case 's':
1796 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001797 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001799 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 size = PyUnicode_GET_LENGTH(*callresult);
1801 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001802 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1803 *callresult, 0,
1804 size) < 0)
1805 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001806 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001807 /* We're done with the unicode()/repr() => forget it */
1808 Py_DECREF(*callresult);
1809 /* switch to next unicode()/repr() result */
1810 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001811 break;
1812 }
1813 case 'U':
1814 {
1815 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 Py_ssize_t size;
1817 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1818 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001819 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1820 obj, 0,
1821 size) < 0)
1822 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001824 break;
1825 }
1826 case 'V':
1827 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001829 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001830 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001831 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 size = PyUnicode_GET_LENGTH(obj);
1833 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001834 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1835 obj, 0,
1836 size) < 0)
1837 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001839 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 size = PyUnicode_GET_LENGTH(*callresult);
1841 assert(PyUnicode_KIND(*callresult) <=
1842 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001843 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1844 *callresult,
1845 0, size) < 0)
1846 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001847 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001848 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001849 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001850 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001851 break;
1852 }
1853 case 'S':
1854 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001855 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001856 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001857 /* unused, since we already have the result */
1858 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001859 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001860 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1861 *callresult, 0,
1862 PyUnicode_GET_LENGTH(*callresult)) < 0)
1863 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001865 /* We're done with the unicode()/repr() => forget it */
1866 Py_DECREF(*callresult);
1867 /* switch to next unicode()/repr() result */
1868 ++callresult;
1869 break;
1870 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001871 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001872 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001873 break;
1874 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 for (; *p; ++p, ++i)
1876 PyUnicode_WRITE(kind, data, i, *p);
1877 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001878 goto end;
1879 }
Victor Stinner1205f272010-09-11 00:54:47 +00001880 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881 else {
1882 assert(i < PyUnicode_GET_LENGTH(string));
1883 PyUnicode_WRITE(kind, data, i++, *f);
1884 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001887
Benjamin Peterson29060642009-01-31 22:14:21 +00001888 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001889 if (callresults)
1890 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891 if (numberresults)
1892 PyObject_Free(numberresults);
1893 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001894 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001895 if (callresults) {
1896 PyObject **callresult2 = callresults;
1897 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001898 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001899 ++callresult2;
1900 }
1901 PyObject_Free(callresults);
1902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 if (numberresults)
1904 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001905 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001906}
1907
Walter Dörwaldd2034312007-05-18 16:29:38 +00001908PyObject *
1909PyUnicode_FromFormat(const char *format, ...)
1910{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001911 PyObject* ret;
1912 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001913
1914#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001915 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001916#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001917 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001918#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001919 ret = PyUnicode_FromFormatV(format, vargs);
1920 va_end(vargs);
1921 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001922}
1923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001924#ifdef HAVE_WCHAR_H
1925
Victor Stinner5593d8a2010-10-02 11:11:27 +00001926/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1927 convert a Unicode object to a wide character string.
1928
Victor Stinnerd88d9832011-09-06 02:00:05 +02001929 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001930 character) required to convert the unicode object. Ignore size argument.
1931
Victor Stinnerd88d9832011-09-06 02:00:05 +02001932 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001933 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001934 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001935static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001936unicode_aswidechar(PyUnicodeObject *unicode,
1937 wchar_t *w,
1938 Py_ssize_t size)
1939{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001940 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941 const wchar_t *wstr;
1942
1943 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1944 if (wstr == NULL)
1945 return -1;
1946
Victor Stinner5593d8a2010-10-02 11:11:27 +00001947 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001948 if (size > res)
1949 size = res + 1;
1950 else
1951 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001953 return res;
1954 }
1955 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001956 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001957}
1958
1959Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001960PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001961 wchar_t *w,
1962 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963{
1964 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001965 PyErr_BadInternalCall();
1966 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001968 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969}
1970
Victor Stinner137c34c2010-09-29 10:25:54 +00001971wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001972PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001973 Py_ssize_t *size)
1974{
1975 wchar_t* buffer;
1976 Py_ssize_t buflen;
1977
1978 if (unicode == NULL) {
1979 PyErr_BadInternalCall();
1980 return NULL;
1981 }
1982
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001983 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 if (buflen == -1)
1985 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001986 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001987 PyErr_NoMemory();
1988 return NULL;
1989 }
1990
Victor Stinner137c34c2010-09-29 10:25:54 +00001991 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1992 if (buffer == NULL) {
1993 PyErr_NoMemory();
1994 return NULL;
1995 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001996 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 if (buflen == -1)
1998 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001999 if (size != NULL)
2000 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002001 return buffer;
2002}
2003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005
Alexander Belopolsky40018472011-02-26 01:02:56 +00002006PyObject *
2007PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002008{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002010 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002011 PyErr_SetString(PyExc_ValueError,
2012 "chr() arg not in range(0x110000)");
2013 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002014 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 if (ordinal < 256)
2017 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 v = PyUnicode_New(1, ordinal);
2020 if (v == NULL)
2021 return NULL;
2022 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2023 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002024}
2025
Alexander Belopolsky40018472011-02-26 01:02:56 +00002026PyObject *
2027PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002029 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002030 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002031 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002032 Py_INCREF(obj);
2033 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002034 }
2035 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002036 /* For a Unicode subtype that's not a Unicode object,
2037 return a true Unicode object with the same data. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038 if (PyUnicode_READY(obj) == -1)
2039 return NULL;
2040 return substring((PyUnicodeObject *)obj, 0, PyUnicode_GET_LENGTH(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002041 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002042 PyErr_Format(PyExc_TypeError,
2043 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002044 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002045 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002046}
2047
Alexander Belopolsky40018472011-02-26 01:02:56 +00002048PyObject *
2049PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002050 const char *encoding,
2051 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002052{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002053 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002054 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002055
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002057 PyErr_BadInternalCall();
2058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002060
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002061 /* Decoding bytes objects is the most common case and should be fast */
2062 if (PyBytes_Check(obj)) {
2063 if (PyBytes_GET_SIZE(obj) == 0) {
2064 Py_INCREF(unicode_empty);
2065 v = (PyObject *) unicode_empty;
2066 }
2067 else {
2068 v = PyUnicode_Decode(
2069 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2070 encoding, errors);
2071 }
2072 return v;
2073 }
2074
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002075 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002076 PyErr_SetString(PyExc_TypeError,
2077 "decoding str is not supported");
2078 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002079 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002080
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002081 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2082 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2083 PyErr_Format(PyExc_TypeError,
2084 "coercing to str: need bytes, bytearray "
2085 "or buffer-like object, %.80s found",
2086 Py_TYPE(obj)->tp_name);
2087 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002088 }
Tim Petersced69f82003-09-16 20:30:58 +00002089
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002090 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002092 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 }
Tim Petersced69f82003-09-16 20:30:58 +00002094 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002095 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002096
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002097 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002098 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099}
2100
/* Write a normalized copy of `encoding` into `lower`: upper-case letters
   are lower-cased and every '_' becomes '-', so that e.g. "UTF_8" matches
   "utf-8". `lower_len` is the capacity of `lower` in bytes.

   Returns 1 on success (with `lower` null-terminated). Returns 0 when
   `encoding` needs more than lower_len - 1 characters; in that case
   `lower` is left without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last usable slot; reserved for the terminating null character. */
    char *const dst_limit = lower + (lower_len - 1);

    for (src = encoding; *src; src++) {
        char ch = *src;

        if (dst == dst_limit)
            return 0;

        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';

        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2133
Alexander Belopolsky40018472011-02-26 01:02:56 +00002134PyObject *
2135PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002136 Py_ssize_t size,
2137 const char *encoding,
2138 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002139{
2140 PyObject *buffer = NULL, *unicode;
2141 Py_buffer info;
2142 char lower[11]; /* Enough for any encoding shortcut */
2143
2144 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002145 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002146
2147 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002148 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002149 if ((strcmp(lower, "utf-8") == 0) ||
2150 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002151 return PyUnicode_DecodeUTF8(s, size, errors);
2152 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002153 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002154 (strcmp(lower, "iso-8859-1") == 0))
2155 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002156#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002157 else if (strcmp(lower, "mbcs") == 0)
2158 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002159#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002160 else if (strcmp(lower, "ascii") == 0)
2161 return PyUnicode_DecodeASCII(s, size, errors);
2162 else if (strcmp(lower, "utf-16") == 0)
2163 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2164 else if (strcmp(lower, "utf-32") == 0)
2165 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167
2168 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002169 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002170 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002171 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002172 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173 if (buffer == NULL)
2174 goto onError;
2175 unicode = PyCodec_Decode(buffer, encoding, errors);
2176 if (unicode == NULL)
2177 goto onError;
2178 if (!PyUnicode_Check(unicode)) {
2179 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002180 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002181 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 Py_DECREF(unicode);
2183 goto onError;
2184 }
2185 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 if (PyUnicode_READY(unicode)) {
2187 Py_DECREF(unicode);
2188 return NULL;
2189 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002191
Benjamin Peterson29060642009-01-31 22:14:21 +00002192 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 Py_XDECREF(buffer);
2194 return NULL;
2195}
2196
Alexander Belopolsky40018472011-02-26 01:02:56 +00002197PyObject *
2198PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002199 const char *encoding,
2200 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002201{
2202 PyObject *v;
2203
2204 if (!PyUnicode_Check(unicode)) {
2205 PyErr_BadArgument();
2206 goto onError;
2207 }
2208
2209 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002210 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002211
2212 /* Decode via the codec registry */
2213 v = PyCodec_Decode(unicode, encoding, errors);
2214 if (v == NULL)
2215 goto onError;
2216 return v;
2217
Benjamin Peterson29060642009-01-31 22:14:21 +00002218 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002219 return NULL;
2220}
2221
Alexander Belopolsky40018472011-02-26 01:02:56 +00002222PyObject *
2223PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002224 const char *encoding,
2225 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002226{
2227 PyObject *v;
2228
2229 if (!PyUnicode_Check(unicode)) {
2230 PyErr_BadArgument();
2231 goto onError;
2232 }
2233
2234 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002235 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002236
2237 /* Decode via the codec registry */
2238 v = PyCodec_Decode(unicode, encoding, errors);
2239 if (v == NULL)
2240 goto onError;
2241 if (!PyUnicode_Check(v)) {
2242 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002243 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002244 Py_TYPE(v)->tp_name);
2245 Py_DECREF(v);
2246 goto onError;
2247 }
2248 return v;
2249
Benjamin Peterson29060642009-01-31 22:14:21 +00002250 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002251 return NULL;
2252}
2253
Alexander Belopolsky40018472011-02-26 01:02:56 +00002254PyObject *
2255PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002256 Py_ssize_t size,
2257 const char *encoding,
2258 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259{
2260 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002261
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 unicode = PyUnicode_FromUnicode(s, size);
2263 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2266 Py_DECREF(unicode);
2267 return v;
2268}
2269
Alexander Belopolsky40018472011-02-26 01:02:56 +00002270PyObject *
2271PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002272 const char *encoding,
2273 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002274{
2275 PyObject *v;
2276
2277 if (!PyUnicode_Check(unicode)) {
2278 PyErr_BadArgument();
2279 goto onError;
2280 }
2281
2282 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002283 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002284
2285 /* Encode via the codec registry */
2286 v = PyCodec_Encode(unicode, encoding, errors);
2287 if (v == NULL)
2288 goto onError;
2289 return v;
2290
Benjamin Peterson29060642009-01-31 22:14:21 +00002291 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002292 return NULL;
2293}
2294
Victor Stinnerad158722010-10-27 00:25:46 +00002295PyObject *
2296PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002297{
Victor Stinner99b95382011-07-04 14:23:54 +02002298#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002299 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2300 PyUnicode_GET_SIZE(unicode),
2301 NULL);
2302#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002303 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002304#else
Victor Stinner793b5312011-04-27 00:24:21 +02002305 PyInterpreterState *interp = PyThreadState_GET()->interp;
2306 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2307 cannot use it to encode and decode filenames before it is loaded. Load
2308 the Python codec requires to encode at least its own filename. Use the C
2309 version of the locale codec until the codec registry is initialized and
2310 the Python codec is loaded.
2311
2312 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2313 cannot only rely on it: check also interp->fscodec_initialized for
2314 subinterpreters. */
2315 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002316 return PyUnicode_AsEncodedString(unicode,
2317 Py_FileSystemDefaultEncoding,
2318 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002319 }
2320 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002321 /* locale encoding with surrogateescape */
2322 wchar_t *wchar;
2323 char *bytes;
2324 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002325 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002326
2327 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2328 if (wchar == NULL)
2329 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002330 bytes = _Py_wchar2char(wchar, &error_pos);
2331 if (bytes == NULL) {
2332 if (error_pos != (size_t)-1) {
2333 char *errmsg = strerror(errno);
2334 PyObject *exc = NULL;
2335 if (errmsg == NULL)
2336 errmsg = "Py_wchar2char() failed";
2337 raise_encode_exception(&exc,
2338 "filesystemencoding",
2339 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2340 error_pos, error_pos+1,
2341 errmsg);
2342 Py_XDECREF(exc);
2343 }
2344 else
2345 PyErr_NoMemory();
2346 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002347 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002348 }
2349 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002350
2351 bytes_obj = PyBytes_FromString(bytes);
2352 PyMem_Free(bytes);
2353 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002354 }
Victor Stinnerad158722010-10-27 00:25:46 +00002355#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002356}
2357
Alexander Belopolsky40018472011-02-26 01:02:56 +00002358PyObject *
2359PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002360 const char *encoding,
2361 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362{
2363 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002364 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002365
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366 if (!PyUnicode_Check(unicode)) {
2367 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002368 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369 }
Fred Drakee4315f52000-05-09 19:53:39 +00002370
Victor Stinner2f283c22011-03-02 01:21:46 +00002371 if (encoding == NULL) {
2372 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002373 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002374 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002376 }
Fred Drakee4315f52000-05-09 19:53:39 +00002377
2378 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002379 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002380 if ((strcmp(lower, "utf-8") == 0) ||
2381 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002382 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002383 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002385 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002387 }
Victor Stinner37296e82010-06-10 13:36:23 +00002388 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002389 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002390 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002392#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002393 else if (strcmp(lower, "mbcs") == 0)
2394 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2395 PyUnicode_GET_SIZE(unicode),
2396 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002397#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002398 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002400 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002401
2402 /* Encode via the codec registry */
2403 v = PyCodec_Encode(unicode, encoding, errors);
2404 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002405 return NULL;
2406
2407 /* The normal path */
2408 if (PyBytes_Check(v))
2409 return v;
2410
2411 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002412 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002413 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002414 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002415
2416 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2417 "encoder %s returned bytearray instead of bytes",
2418 encoding);
2419 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002420 Py_DECREF(v);
2421 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002422 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002423
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002424 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2425 Py_DECREF(v);
2426 return b;
2427 }
2428
2429 PyErr_Format(PyExc_TypeError,
2430 "encoder did not return a bytes object (type=%.400s)",
2431 Py_TYPE(v)->tp_name);
2432 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002433 return NULL;
2434}
2435
Alexander Belopolsky40018472011-02-26 01:02:56 +00002436PyObject *
2437PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002438 const char *encoding,
2439 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002440{
2441 PyObject *v;
2442
2443 if (!PyUnicode_Check(unicode)) {
2444 PyErr_BadArgument();
2445 goto onError;
2446 }
2447
2448 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002449 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002450
2451 /* Encode via the codec registry */
2452 v = PyCodec_Encode(unicode, encoding, errors);
2453 if (v == NULL)
2454 goto onError;
2455 if (!PyUnicode_Check(v)) {
2456 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002457 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002458 Py_TYPE(v)->tp_name);
2459 Py_DECREF(v);
2460 goto onError;
2461 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002463
Benjamin Peterson29060642009-01-31 22:14:21 +00002464 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 return NULL;
2466}
2467
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002468PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002469PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002470 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002471 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2472}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002473
Christian Heimes5894ba72007-11-04 11:43:14 +00002474PyObject*
2475PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2476{
Victor Stinner99b95382011-07-04 14:23:54 +02002477#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002478 return PyUnicode_DecodeMBCS(s, size, NULL);
2479#elif defined(__APPLE__)
2480 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2481#else
Victor Stinner793b5312011-04-27 00:24:21 +02002482 PyInterpreterState *interp = PyThreadState_GET()->interp;
2483 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2484 cannot use it to encode and decode filenames before it is loaded. Load
2485 the Python codec requires to encode at least its own filename. Use the C
2486 version of the locale codec until the codec registry is initialized and
2487 the Python codec is loaded.
2488
2489 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2490 cannot only rely on it: check also interp->fscodec_initialized for
2491 subinterpreters. */
2492 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002493 return PyUnicode_Decode(s, size,
2494 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002495 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002496 }
2497 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002498 /* locale encoding with surrogateescape */
2499 wchar_t *wchar;
2500 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002501 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002502
2503 if (s[size] != '\0' || size != strlen(s)) {
2504 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2505 return NULL;
2506 }
2507
Victor Stinner168e1172010-10-16 23:16:16 +00002508 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002509 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002510 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002511
Victor Stinner168e1172010-10-16 23:16:16 +00002512 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002513 PyMem_Free(wchar);
2514 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002515 }
Victor Stinnerad158722010-10-27 00:25:46 +00002516#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002517}
2518
Martin v. Löwis011e8422009-05-05 04:43:17 +00002519
2520int
2521PyUnicode_FSConverter(PyObject* arg, void* addr)
2522{
2523 PyObject *output = NULL;
2524 Py_ssize_t size;
2525 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002526 if (arg == NULL) {
2527 Py_DECREF(*(PyObject**)addr);
2528 return 1;
2529 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002530 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002531 output = arg;
2532 Py_INCREF(output);
2533 }
2534 else {
2535 arg = PyUnicode_FromObject(arg);
2536 if (!arg)
2537 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002538 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002539 Py_DECREF(arg);
2540 if (!output)
2541 return 0;
2542 if (!PyBytes_Check(output)) {
2543 Py_DECREF(output);
2544 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2545 return 0;
2546 }
2547 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002548 size = PyBytes_GET_SIZE(output);
2549 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002550 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002551 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002552 Py_DECREF(output);
2553 return 0;
2554 }
2555 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002556 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002557}
2558
2559
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002560int
2561PyUnicode_FSDecoder(PyObject* arg, void* addr)
2562{
2563 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002564 if (arg == NULL) {
2565 Py_DECREF(*(PyObject**)addr);
2566 return 1;
2567 }
2568 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002569 if (PyUnicode_READY(arg))
2570 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002571 output = arg;
2572 Py_INCREF(output);
2573 }
2574 else {
2575 arg = PyBytes_FromObject(arg);
2576 if (!arg)
2577 return 0;
2578 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2579 PyBytes_GET_SIZE(arg));
2580 Py_DECREF(arg);
2581 if (!output)
2582 return 0;
2583 if (!PyUnicode_Check(output)) {
2584 Py_DECREF(output);
2585 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2586 return 0;
2587 }
2588 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2590 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002591 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2592 Py_DECREF(output);
2593 return 0;
2594 }
2595 *(PyObject**)addr = output;
2596 return Py_CLEANUP_SUPPORTED;
2597}
2598
2599
Martin v. Löwis5b222132007-06-10 09:51:05 +00002600char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002602{
Christian Heimesf3863112007-11-22 07:46:41 +00002603 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2605
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002606 if (!PyUnicode_Check(unicode)) {
2607 PyErr_BadArgument();
2608 return NULL;
2609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002611 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612
2613 if (_PyUnicode_UTF8(unicode) == NULL) {
2614 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2615 if (bytes == NULL)
2616 return NULL;
2617 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2618 if (u->_base.utf8 == NULL) {
2619 Py_DECREF(bytes);
2620 return NULL;
2621 }
2622 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2623 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2624 Py_DECREF(bytes);
2625 }
2626
2627 if (psize)
2628 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2629 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002630}
2631
2632char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002634{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2636}
2637
2638#ifdef Py_DEBUG
2639int unicode_as_unicode_calls = 0;
2640#endif
2641
2642
2643Py_UNICODE *
2644PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2645{
2646 PyUnicodeObject *u;
2647 const unsigned char *one_byte;
2648#if SIZEOF_WCHAR_T == 4
2649 const Py_UCS2 *two_bytes;
2650#else
2651 const Py_UCS4 *four_bytes;
2652 const Py_UCS4 *ucs4_end;
2653 Py_ssize_t num_surrogates;
2654#endif
2655 wchar_t *w;
2656 wchar_t *wchar_end;
2657
2658 if (!PyUnicode_Check(unicode)) {
2659 PyErr_BadArgument();
2660 return NULL;
2661 }
2662 u = (PyUnicodeObject*)unicode;
2663 if (_PyUnicode_WSTR(u) == NULL) {
2664 /* Non-ASCII compact unicode object */
2665 assert(_PyUnicode_KIND(u) != 0);
2666 assert(PyUnicode_IS_READY(u));
2667
2668#ifdef Py_DEBUG
2669 ++unicode_as_unicode_calls;
2670#endif
2671
2672 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2673#if SIZEOF_WCHAR_T == 2
2674 four_bytes = PyUnicode_4BYTE_DATA(u);
2675 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2676 num_surrogates = 0;
2677
2678 for (; four_bytes < ucs4_end; ++four_bytes) {
2679 if (*four_bytes > 0xFFFF)
2680 ++num_surrogates;
2681 }
2682
2683 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2684 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2685 if (!_PyUnicode_WSTR(u)) {
2686 PyErr_NoMemory();
2687 return NULL;
2688 }
2689 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2690
2691 w = _PyUnicode_WSTR(u);
2692 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2693 four_bytes = PyUnicode_4BYTE_DATA(u);
2694 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2695 if (*four_bytes > 0xFFFF) {
2696 /* encode surrogate pair in this case */
2697 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2698 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2699 }
2700 else
2701 *w = *four_bytes;
2702
2703 if (w > wchar_end) {
2704 assert(0 && "Miscalculated string end");
2705 }
2706 }
2707 *w = 0;
2708#else
2709 /* sizeof(wchar_t) == 4 */
2710 Py_FatalError("Impossible unicode object state, wstr and str "
2711 "should share memory already.");
2712 return NULL;
2713#endif
2714 }
2715 else {
2716 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2717 (_PyUnicode_LENGTH(u) + 1));
2718 if (!_PyUnicode_WSTR(u)) {
2719 PyErr_NoMemory();
2720 return NULL;
2721 }
2722 if (!PyUnicode_IS_COMPACT_ASCII(u))
2723 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2724 w = _PyUnicode_WSTR(u);
2725 wchar_end = w + _PyUnicode_LENGTH(u);
2726
2727 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2728 one_byte = PyUnicode_1BYTE_DATA(u);
2729 for (; w < wchar_end; ++one_byte, ++w)
2730 *w = *one_byte;
2731 /* null-terminate the wstr */
2732 *w = 0;
2733 }
2734 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2735#if SIZEOF_WCHAR_T == 4
2736 two_bytes = PyUnicode_2BYTE_DATA(u);
2737 for (; w < wchar_end; ++two_bytes, ++w)
2738 *w = *two_bytes;
2739 /* null-terminate the wstr */
2740 *w = 0;
2741#else
2742 /* sizeof(wchar_t) == 2 */
2743 PyObject_FREE(_PyUnicode_WSTR(u));
2744 _PyUnicode_WSTR(u) = NULL;
2745 Py_FatalError("Impossible unicode object state, wstr "
2746 "and str should share memory already.");
2747 return NULL;
2748#endif
2749 }
2750 else {
2751 assert(0 && "This should never happen.");
2752 }
2753 }
2754 }
2755 if (size != NULL)
2756 *size = PyUnicode_WSTR_LENGTH(u);
2757 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002758}
2759
Alexander Belopolsky40018472011-02-26 01:02:56 +00002760Py_UNICODE *
2761PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002763 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764}
2765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002766
Alexander Belopolsky40018472011-02-26 01:02:56 +00002767Py_ssize_t
2768PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769{
2770 if (!PyUnicode_Check(unicode)) {
2771 PyErr_BadArgument();
2772 goto onError;
2773 }
2774 return PyUnicode_GET_SIZE(unicode);
2775
Benjamin Peterson29060642009-01-31 22:14:21 +00002776 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 return -1;
2778}
2779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780Py_ssize_t
2781PyUnicode_GetLength(PyObject *unicode)
2782{
2783 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2784 PyErr_BadArgument();
2785 return -1;
2786 }
2787
2788 return PyUnicode_GET_LENGTH(unicode);
2789}
2790
2791Py_UCS4
2792PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2793{
2794 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2795 return PyErr_BadArgument();
2796 return (Py_UCS4)-1;
2797 }
2798 return PyUnicode_READ_CHAR(unicode, index);
2799}
2800
2801int
2802PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2803{
2804 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2805 return PyErr_BadArgument();
2806 return -1;
2807 }
2808
2809 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2810 index, ch);
2811 return 0;
2812}
2813
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2819
Victor Stinner554f3f02010-06-16 23:33:54 +00002820/* create or adjust a UnicodeDecodeError */
2821static void
2822make_decode_exception(PyObject **exceptionObject,
2823 const char *encoding,
2824 const char *input, Py_ssize_t length,
2825 Py_ssize_t startpos, Py_ssize_t endpos,
2826 const char *reason)
2827{
2828 if (*exceptionObject == NULL) {
2829 *exceptionObject = PyUnicodeDecodeError_Create(
2830 encoding, input, length, startpos, endpos, reason);
2831 }
2832 else {
2833 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2834 goto onError;
2835 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2836 goto onError;
2837 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2838 goto onError;
2839 }
2840 return;
2841
2842onError:
2843 Py_DECREF(*exceptionObject);
2844 *exceptionObject = NULL;
2845}
2846
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002847/* error handling callback helper:
2848 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002849 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002850 and adjust various state variables.
2851 return 0 on success, -1 on error
2852*/
2853
Alexander Belopolsky40018472011-02-26 01:02:56 +00002854static int
2855unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002856 const char *encoding, const char *reason,
2857 const char **input, const char **inend, Py_ssize_t *startinpos,
2858 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2859 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002860{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002861 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002862
2863 PyObject *restuple = NULL;
2864 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002865 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002866 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002867 Py_ssize_t requiredsize;
2868 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002869 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002870 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002871 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002872 int res = -1;
2873
2874 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002875 *errorHandler = PyCodec_LookupError(errors);
2876 if (*errorHandler == NULL)
2877 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002878 }
2879
Victor Stinner554f3f02010-06-16 23:33:54 +00002880 make_decode_exception(exceptionObject,
2881 encoding,
2882 *input, *inend - *input,
2883 *startinpos, *endinpos,
2884 reason);
2885 if (*exceptionObject == NULL)
2886 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887
2888 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2889 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002890 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002891 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002892 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002894 }
2895 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002896 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002897
2898 /* Copy back the bytes variables, which might have been modified by the
2899 callback */
2900 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2901 if (!inputobj)
2902 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002903 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002904 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002905 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002906 *input = PyBytes_AS_STRING(inputobj);
2907 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002908 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002909 /* we can DECREF safely, as the exception has another reference,
2910 so the object won't go away. */
2911 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002912
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002914 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002915 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002916 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2917 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002918 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002919
2920 /* need more space? (at least enough for what we
2921 have+the replacement+the rest of the string (starting
2922 at the new input position), so we won't have to check space
2923 when there are no errors in the rest of the string) */
2924 repptr = PyUnicode_AS_UNICODE(repunicode);
2925 repsize = PyUnicode_GET_SIZE(repunicode);
2926 requiredsize = *outpos + repsize + insize-newpos;
2927 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002928 if (requiredsize<2*outsize)
2929 requiredsize = 2*outsize;
2930 if (_PyUnicode_Resize(output, requiredsize) < 0)
2931 goto onError;
2932 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002933 }
2934 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002935 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002936 Py_UNICODE_COPY(*outptr, repptr, repsize);
2937 *outptr += repsize;
2938 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002939
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940 /* we made it! */
2941 res = 0;
2942
Benjamin Peterson29060642009-01-31 22:14:21 +00002943 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002944 Py_XDECREF(restuple);
2945 return res;
2946}
2947
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c)                      \
    (((c) >= 'A' && (c) <= 'Z') ||        \
     ((c) >= 'a' && (c) <= 'z') ||        \
     ((c) >= '0' && (c) <= '9') ||        \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets, indexed by ASCII code:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 * Characters outside 1..127 are never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003028
Alexander Belopolsky40018472011-02-26 01:02:56 +00003029PyObject *
3030PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003031 Py_ssize_t size,
3032 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003033{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003034 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3035}
3036
Antoine Pitrou244651a2009-05-04 18:56:13 +00003037/* The decoder. The only state we preserve is our read position,
3038 * i.e. how many characters we have consumed. So if we end in the
3039 * middle of a shift sequence we have to back off the read position
3040 * and the output to the beginning of the sequence, otherwise we lose
3041 * all the shift state (seen bits, number of bits seen, high
3042 * surrogate). */
3043
Alexander Belopolsky40018472011-02-26 01:02:56 +00003044PyObject *
3045PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003046 Py_ssize_t size,
3047 const char *errors,
3048 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003049{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003050 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003051 Py_ssize_t startinpos;
3052 Py_ssize_t endinpos;
3053 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003054 const char *e;
3055 PyUnicodeObject *unicode;
3056 Py_UNICODE *p;
3057 const char *errmsg = "";
3058 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003059 Py_UNICODE *shiftOutStart;
3060 unsigned int base64bits = 0;
3061 unsigned long base64buffer = 0;
3062 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063 PyObject *errorHandler = NULL;
3064 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003065
3066 unicode = _PyUnicode_New(size);
3067 if (!unicode)
3068 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003069 if (size == 0) {
3070 if (consumed)
3071 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003072 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003073 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003075 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003076 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003077 e = s + size;
3078
3079 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003080 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003081 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003082 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003083
Antoine Pitrou244651a2009-05-04 18:56:13 +00003084 if (inShift) { /* in a base-64 section */
3085 if (IS_BASE64(ch)) { /* consume a base-64 character */
3086 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3087 base64bits += 6;
3088 s++;
3089 if (base64bits >= 16) {
3090 /* we have enough bits for a UTF-16 value */
3091 Py_UNICODE outCh = (Py_UNICODE)
3092 (base64buffer >> (base64bits-16));
3093 base64bits -= 16;
3094 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3095 if (surrogate) {
3096 /* expecting a second surrogate */
3097 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3098#ifdef Py_UNICODE_WIDE
3099 *p++ = (((surrogate & 0x3FF)<<10)
3100 | (outCh & 0x3FF)) + 0x10000;
3101#else
3102 *p++ = surrogate;
3103 *p++ = outCh;
3104#endif
3105 surrogate = 0;
3106 }
3107 else {
3108 surrogate = 0;
3109 errmsg = "second surrogate missing";
3110 goto utf7Error;
3111 }
3112 }
3113 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3114 /* first surrogate */
3115 surrogate = outCh;
3116 }
3117 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3118 errmsg = "unexpected second surrogate";
3119 goto utf7Error;
3120 }
3121 else {
3122 *p++ = outCh;
3123 }
3124 }
3125 }
3126 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003127 inShift = 0;
3128 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003129 if (surrogate) {
3130 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003131 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003132 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003133 if (base64bits > 0) { /* left-over bits */
3134 if (base64bits >= 6) {
3135 /* We've seen at least one base-64 character */
3136 errmsg = "partial character in shift sequence";
3137 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003138 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003139 else {
3140 /* Some bits remain; they should be zero */
3141 if (base64buffer != 0) {
3142 errmsg = "non-zero padding bits in shift sequence";
3143 goto utf7Error;
3144 }
3145 }
3146 }
3147 if (ch != '-') {
3148 /* '-' is absorbed; other terminating
3149 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003150 *p++ = ch;
3151 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003152 }
3153 }
3154 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003155 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003156 s++; /* consume '+' */
3157 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003158 s++;
3159 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003160 }
3161 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003162 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003163 shiftOutStart = p;
3164 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003165 }
3166 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003167 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003168 *p++ = ch;
3169 s++;
3170 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003171 else {
3172 startinpos = s-starts;
3173 s++;
3174 errmsg = "unexpected special character";
3175 goto utf7Error;
3176 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003177 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003178utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003179 outpos = p-PyUnicode_AS_UNICODE(unicode);
3180 endinpos = s-starts;
3181 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003182 errors, &errorHandler,
3183 "utf7", errmsg,
3184 &starts, &e, &startinpos, &endinpos, &exc, &s,
3185 &unicode, &outpos, &p))
3186 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003187 }
3188
Antoine Pitrou244651a2009-05-04 18:56:13 +00003189 /* end of string */
3190
3191 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3192 /* if we're in an inconsistent state, that's an error */
3193 if (surrogate ||
3194 (base64bits >= 6) ||
3195 (base64bits > 0 && base64buffer != 0)) {
3196 outpos = p-PyUnicode_AS_UNICODE(unicode);
3197 endinpos = size;
3198 if (unicode_decode_call_errorhandler(
3199 errors, &errorHandler,
3200 "utf7", "unterminated shift sequence",
3201 &starts, &e, &startinpos, &endinpos, &exc, &s,
3202 &unicode, &outpos, &p))
3203 goto onError;
3204 if (s < e)
3205 goto restart;
3206 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003207 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003208
3209 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003210 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003211 if (inShift) {
3212 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003213 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003214 }
3215 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003216 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003217 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003218 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003219
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003220 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003221 goto onError;
3222
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003223 Py_XDECREF(errorHandler);
3224 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003225 if (PyUnicode_READY(unicode) == -1) {
3226 Py_DECREF(unicode);
3227 return NULL;
3228 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003229 return (PyObject *)unicode;
3230
Benjamin Peterson29060642009-01-31 22:14:21 +00003231 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 Py_XDECREF(errorHandler);
3233 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003234 Py_DECREF(unicode);
3235 return NULL;
3236}
3237
3238
Alexander Belopolsky40018472011-02-26 01:02:56 +00003239PyObject *
3240PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003241 Py_ssize_t size,
3242 int base64SetO,
3243 int base64WhiteSpace,
3244 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003245{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003246 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003247 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003248 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003249 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003250 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003251 unsigned int base64bits = 0;
3252 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003253 char * out;
3254 char * start;
3255
3256 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003257 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003258
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003259 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003260 return PyErr_NoMemory();
3261
Antoine Pitrou244651a2009-05-04 18:56:13 +00003262 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003263 if (v == NULL)
3264 return NULL;
3265
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003266 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003267 for (;i < size; ++i) {
3268 Py_UNICODE ch = s[i];
3269
Antoine Pitrou244651a2009-05-04 18:56:13 +00003270 if (inShift) {
3271 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3272 /* shifting out */
3273 if (base64bits) { /* output remaining bits */
3274 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3275 base64buffer = 0;
3276 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003277 }
3278 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003279 /* Characters not in the BASE64 set implicitly unshift the sequence
3280 so no '-' is required, except if the character is itself a '-' */
3281 if (IS_BASE64(ch) || ch == '-') {
3282 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003283 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003284 *out++ = (char) ch;
3285 }
3286 else {
3287 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003288 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003289 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003290 else { /* not in a shift sequence */
3291 if (ch == '+') {
3292 *out++ = '+';
3293 *out++ = '-';
3294 }
3295 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3296 *out++ = (char) ch;
3297 }
3298 else {
3299 *out++ = '+';
3300 inShift = 1;
3301 goto encode_char;
3302 }
3303 }
3304 continue;
3305encode_char:
3306#ifdef Py_UNICODE_WIDE
3307 if (ch >= 0x10000) {
3308 /* code first surrogate */
3309 base64bits += 16;
3310 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3311 while (base64bits >= 6) {
3312 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3313 base64bits -= 6;
3314 }
3315 /* prepare second surrogate */
3316 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3317 }
3318#endif
3319 base64bits += 16;
3320 base64buffer = (base64buffer << 16) | ch;
3321 while (base64bits >= 6) {
3322 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3323 base64bits -= 6;
3324 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003325 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003326 if (base64bits)
3327 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3328 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003329 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003330 if (_PyBytes_Resize(&v, out - start) < 0)
3331 return NULL;
3332 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003333}
3334
Antoine Pitrou244651a2009-05-04 18:56:13 +00003335#undef IS_BASE64
3336#undef FROM_BASE64
3337#undef TO_BASE64
3338#undef DECODE_DIRECT
3339#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003340
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341/* --- UTF-8 Codec -------------------------------------------------------- */
3342
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3364
Alexander Belopolsky40018472011-02-26 01:02:56 +00003365PyObject *
3366PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003367 Py_ssize_t size,
3368 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369{
Walter Dörwald69652032004-09-07 20:24:22 +00003370 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3371}
3372
Antoine Pitrouab868312009-01-10 15:40:25 +00003373/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3374#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3375
3376/* Mask to quickly check whether a C 'long' contains a
3377 non-ASCII, UTF8-encoded char. */
3378#if (SIZEOF_LONG == 8)
3379# define ASCII_CHAR_MASK 0x8080808080808080L
3380#elif (SIZEOF_LONG == 4)
3381# define ASCII_CHAR_MASK 0x80808080L
3382#else
3383# error C 'long' size should be either 4 or 8!
3384#endif
3385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003386/* Scans a UTF-8 string and returns the maximum character to be expected,
3387 the size of the decoded unicode string and if any major errors were
3388 encountered.
3389
3390 This function does check basic UTF-8 sanity, it does however NOT CHECK
3391 if the string contains surrogates, and if all continuation bytes are
3392 within the correct ranges, these checks are performed in
3393 PyUnicode_DecodeUTF8Stateful.
3394
3395 If it sets has_errors to 1, it means the value of unicode_size and max_char
3396 will be bogus and you should not rely on useful information in them.
3397 */
3398static Py_UCS4
3399utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3400 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3401 int *has_errors)
3402{
3403 Py_ssize_t n;
3404 Py_ssize_t char_count = 0;
3405 Py_UCS4 max_char = 127, new_max;
3406 Py_UCS4 upper_bound;
3407 const unsigned char *p = (const unsigned char *)s;
3408 const unsigned char *end = p + string_size;
3409 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3410 int err = 0;
3411
3412 for (; p < end && !err; ++p, ++char_count) {
3413 /* Only check value if it's not a ASCII char... */
3414 if (*p < 0x80) {
3415 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3416 an explanation. */
3417 if (!((size_t) p & LONG_PTR_MASK)) {
3418 /* Help register allocation */
3419 register const unsigned char *_p = p;
3420 while (_p < aligned_end) {
3421 unsigned long value = *(unsigned long *) _p;
3422 if (value & ASCII_CHAR_MASK)
3423 break;
3424 _p += SIZEOF_LONG;
3425 char_count += SIZEOF_LONG;
3426 }
3427 p = _p;
3428 if (p == end)
3429 break;
3430 }
3431 }
3432 if (*p >= 0x80) {
3433 n = utf8_code_length[*p];
3434 new_max = max_char;
3435 switch (n) {
3436 /* invalid start byte */
3437 case 0:
3438 err = 1;
3439 break;
3440 case 2:
3441 /* Code points between 0x00FF and 0x07FF inclusive.
3442 Approximate the upper bound of the code point,
3443 if this flips over 255 we can be sure it will be more
3444 than 255 and the string will need 2 bytes per code coint,
3445 if it stays under or equal to 255, we can be sure 1 byte
3446 is enough.
3447 ((*p & 0b00011111) << 6) | 0b00111111 */
3448 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3449 if (max_char < upper_bound)
3450 new_max = upper_bound;
3451 /* Ensure we track at least that we left ASCII space. */
3452 if (new_max < 128)
3453 new_max = 128;
3454 break;
3455 case 3:
3456 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3457 always > 255 and <= 65535 and will always need 2 bytes. */
3458 if (max_char < 65535)
3459 new_max = 65535;
3460 break;
3461 case 4:
3462 /* Code point will be above 0xFFFF for sure in this case. */
3463 new_max = 65537;
3464 break;
3465 /* Internal error, this should be caught by the first if */
3466 case 1:
3467 default:
3468 assert(0 && "Impossible case in utf8_max_char_and_size");
3469 err = 1;
3470 }
3471 /* Instead of number of overall bytes for this code point,
3472 n containts the number of following bytes: */
3473 --n;
3474 /* Check if the follow up chars are all valid continuation bytes */
3475 if (n >= 1) {
3476 const unsigned char *cont;
3477 if ((p + n) >= end) {
3478 if (consumed == 0)
3479 /* incomplete data, non-incremental decoding */
3480 err = 1;
3481 break;
3482 }
3483 for (cont = p + 1; cont < (p + n); ++cont) {
3484 if ((*cont & 0xc0) != 0x80) {
3485 err = 1;
3486 break;
3487 }
3488 }
3489 p += n;
3490 }
3491 else
3492 err = 1;
3493 max_char = new_max;
3494 }
3495 }
3496
3497 if (unicode_size)
3498 *unicode_size = char_count;
3499 if (has_errors)
3500 *has_errors = err;
3501 return max_char;
3502}
3503
/* Store value at position index of buf, where kind selects the element
   type of buf.  Like PyUnicode_WRITE, but additionally understands
   PyUnicode_WCHAR_KIND, i.e. the Py_UNICODE (wstr) buffer of the legacy
   unicode representation.  Any kind that is not the WCHAR, 1BYTE or
   2BYTE kind is written as Py_UCS4.  Exactly one store is executed, so
   index and value are evaluated once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        switch ((int)(kind)) {                                          \
        case PyUnicode_WCHAR_KIND:                                      \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
            break;                                                      \
        case PyUnicode_1BYTE_KIND:                                      \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break;                                                      \
        case PyUnicode_2BYTE_KIND:                                      \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
            break;                                                      \
        default:                                                        \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
            break;                                                      \
        }                                                               \
    } while (0)
3518
Alexander Belopolsky40018472011-02-26 01:02:56 +00003519PyObject *
3520PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003521 Py_ssize_t size,
3522 const char *errors,
3523 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003527 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003528 Py_ssize_t startinpos;
3529 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003530 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003532 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 PyObject *errorHandler = NULL;
3534 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003535 Py_UCS4 maxchar = 0;
3536 Py_ssize_t unicode_size;
3537 Py_ssize_t i;
3538 int kind;
3539 void *data;
3540 int has_errors;
3541 Py_UNICODE *error_outptr;
3542#if SIZEOF_WCHAR_T == 2
3543 Py_ssize_t wchar_offset = 0;
3544#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545
Walter Dörwald69652032004-09-07 20:24:22 +00003546 if (size == 0) {
3547 if (consumed)
3548 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003549 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003551 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3552 consumed, &has_errors);
3553 if (has_errors) {
3554 unicode = _PyUnicode_New(size);
3555 if (!unicode)
3556 return NULL;
3557 kind = PyUnicode_WCHAR_KIND;
3558 data = PyUnicode_AS_UNICODE(unicode);
3559 assert(data != NULL);
3560 }
3561 else {
3562 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3563 if (!unicode)
3564 return NULL;
3565 /* When the string is ASCII only, just use memcpy and return.
3566 unicode_size may be != size if there is an incomplete UTF-8
3567 sequence at the end of the ASCII block. */
3568 if (maxchar < 128 && size == unicode_size) {
3569 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3570 return (PyObject *)unicode;
3571 }
3572 kind = PyUnicode_KIND(unicode);
3573 data = PyUnicode_DATA(unicode);
3574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003576 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003578 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579
3580 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003581 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582
3583 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003584 /* Fast path for runs of ASCII characters. Given that common UTF-8
3585 input will consist of an overwhelming majority of ASCII
3586 characters, we try to optimize for this case by checking
3587 as many characters as a C 'long' can contain.
3588 First, check if we can do an aligned read, as most CPUs have
3589 a penalty for unaligned reads.
3590 */
3591 if (!((size_t) s & LONG_PTR_MASK)) {
3592 /* Help register allocation */
3593 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003594 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003595 while (_s < aligned_end) {
3596 /* Read a whole long at a time (either 4 or 8 bytes),
3597 and do a fast unrolled copy if it only contains ASCII
3598 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003599 unsigned long value = *(unsigned long *) _s;
3600 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003601 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003602 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3603 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3604 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3605 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003606#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003607 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3608 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3609 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3610 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003611#endif
3612 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003613 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003614 }
3615 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003616 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003617 if (s == e)
3618 break;
3619 ch = (unsigned char)*s;
3620 }
3621 }
3622
3623 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003624 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625 s++;
3626 continue;
3627 }
3628
3629 n = utf8_code_length[ch];
3630
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003631 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003632 if (consumed)
3633 break;
3634 else {
3635 errmsg = "unexpected end of data";
3636 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003637 endinpos = startinpos+1;
3638 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3639 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003640 goto utf8Error;
3641 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643
3644 switch (n) {
3645
3646 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003647 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003648 startinpos = s-starts;
3649 endinpos = startinpos+1;
3650 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651
3652 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003653 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003654 startinpos = s-starts;
3655 endinpos = startinpos+1;
3656 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657
3658 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003659 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003660 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003661 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003662 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003663 goto utf8Error;
3664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003666 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003667 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 break;
3669
3670 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003671 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3672 will result in surrogates in range d800-dfff. Surrogates are
3673 not valid UTF-8 so they are rejected.
3674 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3675 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003676 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003677 (s[2] & 0xc0) != 0x80 ||
3678 ((unsigned char)s[0] == 0xE0 &&
3679 (unsigned char)s[1] < 0xA0) ||
3680 ((unsigned char)s[0] == 0xED &&
3681 (unsigned char)s[1] > 0x9F)) {
3682 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003684 endinpos = startinpos + 1;
3685
3686 /* if s[1] first two bits are 1 and 0, then the invalid
3687 continuation byte is s[2], so increment endinpos by 1,
3688 if not, s[1] is invalid and endinpos doesn't need to
3689 be incremented. */
3690 if ((s[1] & 0xC0) == 0x80)
3691 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003692 goto utf8Error;
3693 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003695 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003696 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003697 break;
3698
3699 case 4:
3700 if ((s[1] & 0xc0) != 0x80 ||
3701 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003702 (s[3] & 0xc0) != 0x80 ||
3703 ((unsigned char)s[0] == 0xF0 &&
3704 (unsigned char)s[1] < 0x90) ||
3705 ((unsigned char)s[0] == 0xF4 &&
3706 (unsigned char)s[1] > 0x8F)) {
3707 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003708 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003709 endinpos = startinpos + 1;
3710 if ((s[1] & 0xC0) == 0x80) {
3711 endinpos++;
3712 if ((s[2] & 0xC0) == 0x80)
3713 endinpos++;
3714 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003715 goto utf8Error;
3716 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003717 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003718 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3719 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003721 /* If the string is flexible or we have native UCS-4, write
3722 directly.. */
3723 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3724 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003726 else {
3727 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729 /* translate from 10000..10FFFF to 0..FFFF */
3730 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003732 /* high surrogate = top 10 bits added to D800 */
3733 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3734 (Py_UNICODE)(0xD800 + (ch >> 10)));
3735
3736 /* low surrogate = bottom 10 bits added to DC00 */
3737 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3738 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3739 }
3740#if SIZEOF_WCHAR_T == 2
3741 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003742#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 }
3745 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003746 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003747
Benjamin Peterson29060642009-01-31 22:14:21 +00003748 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003749 /* If this is not yet a resizable string, make it one.. */
3750 if (kind != PyUnicode_WCHAR_KIND) {
3751 const Py_UNICODE *u;
3752 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3753 if (!new_unicode)
3754 goto onError;
3755 u = PyUnicode_AsUnicode((PyObject *)unicode);
3756 if (!u)
3757 goto onError;
3758#if SIZEOF_WCHAR_T == 2
3759 i += wchar_offset;
3760#endif
3761 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3762 Py_DECREF(unicode);
3763 unicode = new_unicode;
3764 kind = 0;
3765 data = PyUnicode_AS_UNICODE(new_unicode);
3766 assert(data != NULL);
3767 }
3768 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003769 if (unicode_decode_call_errorhandler(
3770 errors, &errorHandler,
3771 "utf8", errmsg,
3772 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003774 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003775 /* Update data because unicode_decode_call_errorhandler might have
3776 re-created or resized the unicode object. */
3777 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003778 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003780 /* Ensure the unicode_size calculation above was correct: */
3781 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3782
Walter Dörwald69652032004-09-07 20:24:22 +00003783 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003784 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003786 /* Adjust length and ready string when it contained errors and
3787 is of the old resizable kind. */
3788 if (kind == PyUnicode_WCHAR_KIND) {
3789 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3790 PyUnicode_READY(unicode) == -1)
3791 goto onError;
3792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003794 Py_XDECREF(errorHandler);
3795 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796 if (PyUnicode_READY(unicode) == -1) {
3797 Py_DECREF(unicode);
3798 return NULL;
3799 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800 return (PyObject *)unicode;
3801
Benjamin Peterson29060642009-01-31 22:14:21 +00003802 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003803 Py_XDECREF(errorHandler);
3804 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 Py_DECREF(unicode);
3806 return NULL;
3807}
3808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003809#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003810
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003811#ifdef __APPLE__
3812
3813/* Simplified UTF-8 decoder using surrogateescape error handler,
3814 used to decode the command line arguments on Mac OS X. */
3815
3816wchar_t*
3817_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3818{
3819 int n;
3820 const char *e;
3821 wchar_t *unicode, *p;
3822
3823 /* Note: size will always be longer than the resulting Unicode
3824 character count */
3825 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3826 PyErr_NoMemory();
3827 return NULL;
3828 }
3829 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3830 if (!unicode)
3831 return NULL;
3832
3833 /* Unpack UTF-8 encoded data */
3834 p = unicode;
3835 e = s + size;
3836 while (s < e) {
3837 Py_UCS4 ch = (unsigned char)*s;
3838
3839 if (ch < 0x80) {
3840 *p++ = (wchar_t)ch;
3841 s++;
3842 continue;
3843 }
3844
3845 n = utf8_code_length[ch];
3846 if (s + n > e) {
3847 goto surrogateescape;
3848 }
3849
3850 switch (n) {
3851 case 0:
3852 case 1:
3853 goto surrogateescape;
3854
3855 case 2:
3856 if ((s[1] & 0xc0) != 0x80)
3857 goto surrogateescape;
3858 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3859 assert ((ch > 0x007F) && (ch <= 0x07FF));
3860 *p++ = (wchar_t)ch;
3861 break;
3862
3863 case 3:
3864 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3865 will result in surrogates in range d800-dfff. Surrogates are
3866 not valid UTF-8 so they are rejected.
3867 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3868 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3869 if ((s[1] & 0xc0) != 0x80 ||
3870 (s[2] & 0xc0) != 0x80 ||
3871 ((unsigned char)s[0] == 0xE0 &&
3872 (unsigned char)s[1] < 0xA0) ||
3873 ((unsigned char)s[0] == 0xED &&
3874 (unsigned char)s[1] > 0x9F)) {
3875
3876 goto surrogateescape;
3877 }
3878 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3879 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003881 break;
3882
3883 case 4:
3884 if ((s[1] & 0xc0) != 0x80 ||
3885 (s[2] & 0xc0) != 0x80 ||
3886 (s[3] & 0xc0) != 0x80 ||
3887 ((unsigned char)s[0] == 0xF0 &&
3888 (unsigned char)s[1] < 0x90) ||
3889 ((unsigned char)s[0] == 0xF4 &&
3890 (unsigned char)s[1] > 0x8F)) {
3891 goto surrogateescape;
3892 }
3893 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3894 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3895 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3896
3897#if SIZEOF_WCHAR_T == 4
3898 *p++ = (wchar_t)ch;
3899#else
3900 /* compute and append the two surrogates: */
3901
3902 /* translate from 10000..10FFFF to 0..FFFF */
3903 ch -= 0x10000;
3904
3905 /* high surrogate = top 10 bits added to D800 */
3906 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3907
3908 /* low surrogate = bottom 10 bits added to DC00 */
3909 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3910#endif
3911 break;
3912 }
3913 s += n;
3914 continue;
3915
3916 surrogateescape:
3917 *p++ = 0xDC00 + ch;
3918 s++;
3919 }
3920 *p = L'\0';
3921 return unicode;
3922}
3923
3924#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926/* Primary internal function which creates utf8 encoded bytes objects.
3927
3928 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003929 and allocate exactly as much space needed at the end. Else allocate the
3930 maximum possible needed (4 result bytes per Unicode character), and return
3931 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003932*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003933PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003935{
Tim Peters602f7402002-04-27 18:03:26 +00003936#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003937
Guido van Rossum98297ee2007-11-06 21:34:58 +00003938 Py_ssize_t i; /* index into s of next input byte */
3939 PyObject *result; /* result string object */
3940 char *p; /* next free byte in output buffer */
3941 Py_ssize_t nallocated; /* number of result bytes allocated */
3942 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003943 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003944 PyObject *errorHandler = NULL;
3945 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003946 int kind;
3947 void *data;
3948 Py_ssize_t size;
3949 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3950#if SIZEOF_WCHAR_T == 2
3951 Py_ssize_t wchar_offset = 0;
3952#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954 if (!PyUnicode_Check(unicode)) {
3955 PyErr_BadArgument();
3956 return NULL;
3957 }
3958
3959 if (PyUnicode_READY(unicode) == -1)
3960 return NULL;
3961
3962 if (_PyUnicode_UTF8(unicode))
3963 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
3964 _PyUnicode_UTF8_LENGTH(unicode));
3965
3966 kind = PyUnicode_KIND(unicode);
3967 data = PyUnicode_DATA(unicode);
3968 size = PyUnicode_GET_LENGTH(unicode);
3969
Tim Peters602f7402002-04-27 18:03:26 +00003970 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971
Tim Peters602f7402002-04-27 18:03:26 +00003972 if (size <= MAX_SHORT_UNICHARS) {
3973 /* Write into the stack buffer; nallocated can't overflow.
3974 * At the end, we'll allocate exactly as much heap space as it
3975 * turns out we need.
3976 */
3977 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003978 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00003979 p = stackbuf;
3980 }
3981 else {
3982 /* Overallocate on the heap, and give the excess back at the end. */
3983 nallocated = size * 4;
3984 if (nallocated / 4 != size) /* overflow! */
3985 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00003986 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003987 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00003988 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003989 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003990 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003991
Tim Peters602f7402002-04-27 18:03:26 +00003992 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003994
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003995 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00003996 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003998
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004000 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004001 *p++ = (char)(0xc0 | (ch >> 6));
4002 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004003 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004004 Py_ssize_t newpos;
4005 PyObject *rep;
4006 Py_ssize_t repsize, k, startpos;
4007 startpos = i-1;
4008#if SIZEOF_WCHAR_T == 2
4009 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004010#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 rep = unicode_encode_call_errorhandler(
4012 errors, &errorHandler, "utf-8", "surrogates not allowed",
4013 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4014 &exc, startpos, startpos+1, &newpos);
4015 if (!rep)
4016 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 if (PyBytes_Check(rep))
4019 repsize = PyBytes_GET_SIZE(rep);
4020 else
4021 repsize = PyUnicode_GET_SIZE(rep);
4022
4023 if (repsize > 4) {
4024 Py_ssize_t offset;
4025
4026 if (result == NULL)
4027 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004028 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004029 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004031 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4032 /* integer overflow */
4033 PyErr_NoMemory();
4034 goto error;
4035 }
4036 nallocated += repsize - 4;
4037 if (result != NULL) {
4038 if (_PyBytes_Resize(&result, nallocated) < 0)
4039 goto error;
4040 } else {
4041 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004042 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043 goto error;
4044 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4045 }
4046 p = PyBytes_AS_STRING(result) + offset;
4047 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049 if (PyBytes_Check(rep)) {
4050 char *prep = PyBytes_AS_STRING(rep);
4051 for(k = repsize; k > 0; k--)
4052 *p++ = *prep++;
4053 } else /* rep is unicode */ {
4054 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4055 Py_UNICODE c;
4056
4057 for(k=0; k<repsize; k++) {
4058 c = prep[k];
4059 if (0x80 <= c) {
4060 raise_encode_exception(&exc, "utf-8",
4061 PyUnicode_AS_UNICODE(unicode),
4062 size, i-1, i,
4063 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004064 goto error;
4065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004067 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004070 } else if (ch < 0x10000) {
4071 *p++ = (char)(0xe0 | (ch >> 12));
4072 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4073 *p++ = (char)(0x80 | (ch & 0x3f));
4074 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004075 /* Encode UCS4 Unicode ordinals */
4076 *p++ = (char)(0xf0 | (ch >> 18));
4077 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4078 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4079 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080#if SIZEOF_WCHAR_T == 2
4081 wchar_offset++;
4082#endif
Tim Peters602f7402002-04-27 18:03:26 +00004083 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004085
Guido van Rossum98297ee2007-11-06 21:34:58 +00004086 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004087 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004088 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004089 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004090 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004091 }
4092 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004093 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004094 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004095 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004096 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004098
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004099 Py_XDECREF(errorHandler);
4100 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004101 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004102 error:
4103 Py_XDECREF(errorHandler);
4104 Py_XDECREF(exc);
4105 Py_XDECREF(result);
4106 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004107
Tim Peters602f7402002-04-27 18:03:26 +00004108#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109}
4110
Alexander Belopolsky40018472011-02-26 01:02:56 +00004111PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004112PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4113 Py_ssize_t size,
4114 const char *errors)
4115{
4116 PyObject *v, *unicode;
4117
4118 unicode = PyUnicode_FromUnicode(s, size);
4119 if (unicode == NULL)
4120 return NULL;
4121 v = _PyUnicode_AsUTF8String(unicode, errors);
4122 Py_DECREF(unicode);
4123 return v;
4124}
4125
4126PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004127PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004129 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130}
4131
Walter Dörwald41980ca2007-08-16 21:55:45 +00004132/* --- UTF-32 Codec ------------------------------------------------------- */
4133
4134PyObject *
4135PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 Py_ssize_t size,
4137 const char *errors,
4138 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004139{
4140 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4141}
4142
4143PyObject *
4144PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 Py_ssize_t size,
4146 const char *errors,
4147 int *byteorder,
4148 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004149{
4150 const char *starts = s;
4151 Py_ssize_t startinpos;
4152 Py_ssize_t endinpos;
4153 Py_ssize_t outpos;
4154 PyUnicodeObject *unicode;
4155 Py_UNICODE *p;
4156#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004157 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004158 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004159#else
4160 const int pairs = 0;
4161#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004162 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004163 int bo = 0; /* assume native ordering by default */
4164 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004165 /* Offsets from q for retrieving bytes in the right order. */
4166#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4167 int iorder[] = {0, 1, 2, 3};
4168#else
4169 int iorder[] = {3, 2, 1, 0};
4170#endif
4171 PyObject *errorHandler = NULL;
4172 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004173
Walter Dörwald41980ca2007-08-16 21:55:45 +00004174 q = (unsigned char *)s;
4175 e = q + size;
4176
4177 if (byteorder)
4178 bo = *byteorder;
4179
4180 /* Check for BOM marks (U+FEFF) in the input and adjust current
4181 byte order setting accordingly. In native mode, the leading BOM
4182 mark is skipped, in all other modes, it is copied to the output
4183 stream as-is (giving a ZWNBSP character). */
4184 if (bo == 0) {
4185 if (size >= 4) {
4186 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004187 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004188#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 if (bom == 0x0000FEFF) {
4190 q += 4;
4191 bo = -1;
4192 }
4193 else if (bom == 0xFFFE0000) {
4194 q += 4;
4195 bo = 1;
4196 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004197#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 if (bom == 0x0000FEFF) {
4199 q += 4;
4200 bo = 1;
4201 }
4202 else if (bom == 0xFFFE0000) {
4203 q += 4;
4204 bo = -1;
4205 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004206#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004208 }
4209
4210 if (bo == -1) {
4211 /* force LE */
4212 iorder[0] = 0;
4213 iorder[1] = 1;
4214 iorder[2] = 2;
4215 iorder[3] = 3;
4216 }
4217 else if (bo == 1) {
4218 /* force BE */
4219 iorder[0] = 3;
4220 iorder[1] = 2;
4221 iorder[2] = 1;
4222 iorder[3] = 0;
4223 }
4224
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004225 /* On narrow builds we split characters outside the BMP into two
4226 codepoints => count how much extra space we need. */
4227#ifndef Py_UNICODE_WIDE
4228 for (qq = q; qq < e; qq += 4)
4229 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4230 pairs++;
4231#endif
4232
4233 /* This might be one to much, because of a BOM */
4234 unicode = _PyUnicode_New((size+3)/4+pairs);
4235 if (!unicode)
4236 return NULL;
4237 if (size == 0)
4238 return (PyObject *)unicode;
4239
4240 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004241 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004242
Walter Dörwald41980ca2007-08-16 21:55:45 +00004243 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 Py_UCS4 ch;
4245 /* remaining bytes at the end? (size should be divisible by 4) */
4246 if (e-q<4) {
4247 if (consumed)
4248 break;
4249 errmsg = "truncated data";
4250 startinpos = ((const char *)q)-starts;
4251 endinpos = ((const char *)e)-starts;
4252 goto utf32Error;
4253 /* The remaining input chars are ignored if the callback
4254 chooses to skip the input */
4255 }
4256 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4257 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004258
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 if (ch >= 0x110000)
4260 {
4261 errmsg = "codepoint not in range(0x110000)";
4262 startinpos = ((const char *)q)-starts;
4263 endinpos = startinpos+4;
4264 goto utf32Error;
4265 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004266#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004267 if (ch >= 0x10000)
4268 {
4269 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4270 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4271 }
4272 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004273#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 *p++ = ch;
4275 q += 4;
4276 continue;
4277 utf32Error:
4278 outpos = p-PyUnicode_AS_UNICODE(unicode);
4279 if (unicode_decode_call_errorhandler(
4280 errors, &errorHandler,
4281 "utf32", errmsg,
4282 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4283 &unicode, &outpos, &p))
4284 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004285 }
4286
4287 if (byteorder)
4288 *byteorder = bo;
4289
4290 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004291 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004292
4293 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004295 goto onError;
4296
4297 Py_XDECREF(errorHandler);
4298 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004299 if (PyUnicode_READY(unicode) == -1) {
4300 Py_DECREF(unicode);
4301 return NULL;
4302 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004303 return (PyObject *)unicode;
4304
Benjamin Peterson29060642009-01-31 22:14:21 +00004305 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004306 Py_DECREF(unicode);
4307 Py_XDECREF(errorHandler);
4308 Py_XDECREF(exc);
4309 return NULL;
4310}
4311
4312PyObject *
4313PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 Py_ssize_t size,
4315 const char *errors,
4316 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004317{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004318 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004319 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004320 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004321#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004322 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004323#else
4324 const int pairs = 0;
4325#endif
4326 /* Offsets from p for storing byte pairs in the right order. */
4327#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4328 int iorder[] = {0, 1, 2, 3};
4329#else
4330 int iorder[] = {3, 2, 1, 0};
4331#endif
4332
Benjamin Peterson29060642009-01-31 22:14:21 +00004333#define STORECHAR(CH) \
4334 do { \
4335 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4336 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4337 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4338 p[iorder[0]] = (CH) & 0xff; \
4339 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004340 } while(0)
4341
4342 /* In narrow builds we can output surrogate pairs as one codepoint,
4343 so we need less space. */
4344#ifndef Py_UNICODE_WIDE
4345 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004346 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4347 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4348 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004349#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004350 nsize = (size - pairs + (byteorder == 0));
4351 bytesize = nsize * 4;
4352 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004354 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004355 if (v == NULL)
4356 return NULL;
4357
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004358 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004359 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004360 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004361 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004362 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004363
4364 if (byteorder == -1) {
4365 /* force LE */
4366 iorder[0] = 0;
4367 iorder[1] = 1;
4368 iorder[2] = 2;
4369 iorder[3] = 3;
4370 }
4371 else if (byteorder == 1) {
4372 /* force BE */
4373 iorder[0] = 3;
4374 iorder[1] = 2;
4375 iorder[2] = 1;
4376 iorder[3] = 0;
4377 }
4378
4379 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004380 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004381#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004382 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4383 Py_UCS4 ch2 = *s;
4384 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4385 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4386 s++;
4387 size--;
4388 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004389 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004390#endif
4391 STORECHAR(ch);
4392 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004393
4394 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004395 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004396#undef STORECHAR
4397}
4398
Alexander Belopolsky40018472011-02-26 01:02:56 +00004399PyObject *
4400PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004401{
4402 if (!PyUnicode_Check(unicode)) {
4403 PyErr_BadArgument();
4404 return NULL;
4405 }
4406 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 PyUnicode_GET_SIZE(unicode),
4408 NULL,
4409 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004410}
4411
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412/* --- UTF-16 Codec ------------------------------------------------------- */
4413
Tim Peters772747b2001-08-09 22:21:55 +00004414PyObject *
4415PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004416 Py_ssize_t size,
4417 const char *errors,
4418 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419{
Walter Dörwald69652032004-09-07 20:24:22 +00004420 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4421}
4422
Antoine Pitrouab868312009-01-10 15:40:25 +00004423/* Two masks for fast checking of whether a C 'long' may contain
4424 UTF16-encoded surrogate characters. This is an efficient heuristic,
4425 assuming that non-surrogate characters with a code point >= 0x8000 are
4426 rare in most input.
4427 FAST_CHAR_MASK is used when the input is in native byte ordering,
4428 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004429*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004430#if (SIZEOF_LONG == 8)
4431# define FAST_CHAR_MASK 0x8000800080008000L
4432# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4433#elif (SIZEOF_LONG == 4)
4434# define FAST_CHAR_MASK 0x80008000L
4435# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4436#else
4437# error C 'long' size should be either 4 or 8!
4438#endif
4439
Walter Dörwald69652032004-09-07 20:24:22 +00004440PyObject *
4441PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 Py_ssize_t size,
4443 const char *errors,
4444 int *byteorder,
4445 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004446{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004448 Py_ssize_t startinpos;
4449 Py_ssize_t endinpos;
4450 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 PyUnicodeObject *unicode;
4452 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004453 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004454 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004455 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004456 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004457 /* Offsets from q for retrieving byte pairs in the right order. */
4458#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4459 int ihi = 1, ilo = 0;
4460#else
4461 int ihi = 0, ilo = 1;
4462#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463 PyObject *errorHandler = NULL;
4464 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465
4466 /* Note: size will always be longer than the resulting Unicode
4467 character count */
4468 unicode = _PyUnicode_New(size);
4469 if (!unicode)
4470 return NULL;
4471 if (size == 0)
4472 return (PyObject *)unicode;
4473
4474 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004475 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004476 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004477 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478
4479 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004480 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004482 /* Check for BOM marks (U+FEFF) in the input and adjust current
4483 byte order setting accordingly. In native mode, the leading BOM
4484 mark is skipped, in all other modes, it is copied to the output
4485 stream as-is (giving a ZWNBSP character). */
4486 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004487 if (size >= 2) {
4488 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004489#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 if (bom == 0xFEFF) {
4491 q += 2;
4492 bo = -1;
4493 }
4494 else if (bom == 0xFFFE) {
4495 q += 2;
4496 bo = 1;
4497 }
Tim Petersced69f82003-09-16 20:30:58 +00004498#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004499 if (bom == 0xFEFF) {
4500 q += 2;
4501 bo = 1;
4502 }
4503 else if (bom == 0xFFFE) {
4504 q += 2;
4505 bo = -1;
4506 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004507#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510
Tim Peters772747b2001-08-09 22:21:55 +00004511 if (bo == -1) {
4512 /* force LE */
4513 ihi = 1;
4514 ilo = 0;
4515 }
4516 else if (bo == 1) {
4517 /* force BE */
4518 ihi = 0;
4519 ilo = 1;
4520 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004521#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4522 native_ordering = ilo < ihi;
4523#else
4524 native_ordering = ilo > ihi;
4525#endif
Tim Peters772747b2001-08-09 22:21:55 +00004526
Antoine Pitrouab868312009-01-10 15:40:25 +00004527 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004528 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004530 /* First check for possible aligned read of a C 'long'. Unaligned
4531 reads are more expensive, better to defer to another iteration. */
4532 if (!((size_t) q & LONG_PTR_MASK)) {
4533 /* Fast path for runs of non-surrogate chars. */
4534 register const unsigned char *_q = q;
4535 Py_UNICODE *_p = p;
4536 if (native_ordering) {
4537 /* Native ordering is simple: as long as the input cannot
4538 possibly contain a surrogate char, do an unrolled copy
4539 of several 16-bit code points to the target object.
4540 The non-surrogate check is done on several input bytes
4541 at a time (as many as a C 'long' can contain). */
4542 while (_q < aligned_end) {
4543 unsigned long data = * (unsigned long *) _q;
4544 if (data & FAST_CHAR_MASK)
4545 break;
4546 _p[0] = ((unsigned short *) _q)[0];
4547 _p[1] = ((unsigned short *) _q)[1];
4548#if (SIZEOF_LONG == 8)
4549 _p[2] = ((unsigned short *) _q)[2];
4550 _p[3] = ((unsigned short *) _q)[3];
4551#endif
4552 _q += SIZEOF_LONG;
4553 _p += SIZEOF_LONG / 2;
4554 }
4555 }
4556 else {
4557 /* Byteswapped ordering is similar, but we must decompose
4558 the copy bytewise, and take care of zero'ing out the
4559 upper bytes if the target object is in 32-bit units
4560 (that is, in UCS-4 builds). */
4561 while (_q < aligned_end) {
4562 unsigned long data = * (unsigned long *) _q;
4563 if (data & SWAPPED_FAST_CHAR_MASK)
4564 break;
4565 /* Zero upper bytes in UCS-4 builds */
4566#if (Py_UNICODE_SIZE > 2)
4567 _p[0] = 0;
4568 _p[1] = 0;
4569#if (SIZEOF_LONG == 8)
4570 _p[2] = 0;
4571 _p[3] = 0;
4572#endif
4573#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004574 /* Issue #4916; UCS-4 builds on big endian machines must
4575 fill the two last bytes of each 4-byte unit. */
4576#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4577# define OFF 2
4578#else
4579# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004580#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004581 ((unsigned char *) _p)[OFF + 1] = _q[0];
4582 ((unsigned char *) _p)[OFF + 0] = _q[1];
4583 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4584 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4585#if (SIZEOF_LONG == 8)
4586 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4587 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4588 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4589 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4590#endif
4591#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004592 _q += SIZEOF_LONG;
4593 _p += SIZEOF_LONG / 2;
4594 }
4595 }
4596 p = _p;
4597 q = _q;
4598 if (q >= e)
4599 break;
4600 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602
Benjamin Peterson14339b62009-01-31 16:36:08 +00004603 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004604
4605 if (ch < 0xD800 || ch > 0xDFFF) {
4606 *p++ = ch;
4607 continue;
4608 }
4609
4610 /* UTF-16 code pair: */
4611 if (q > e) {
4612 errmsg = "unexpected end of data";
4613 startinpos = (((const char *)q) - 2) - starts;
4614 endinpos = ((const char *)e) + 1 - starts;
4615 goto utf16Error;
4616 }
4617 if (0xD800 <= ch && ch <= 0xDBFF) {
4618 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4619 q += 2;
4620 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004621#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004622 *p++ = ch;
4623 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004624#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004626#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 continue;
4628 }
4629 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004630 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004631 startinpos = (((const char *)q)-4)-starts;
4632 endinpos = startinpos+2;
4633 goto utf16Error;
4634 }
4635
Benjamin Peterson14339b62009-01-31 16:36:08 +00004636 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004637 errmsg = "illegal encoding";
4638 startinpos = (((const char *)q)-2)-starts;
4639 endinpos = startinpos+2;
4640 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004641
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 utf16Error:
4643 outpos = p - PyUnicode_AS_UNICODE(unicode);
4644 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004645 errors,
4646 &errorHandler,
4647 "utf16", errmsg,
4648 &starts,
4649 (const char **)&e,
4650 &startinpos,
4651 &endinpos,
4652 &exc,
4653 (const char **)&q,
4654 &unicode,
4655 &outpos,
4656 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004659 /* remaining byte at the end? (size should be even) */
4660 if (e == q) {
4661 if (!consumed) {
4662 errmsg = "truncated data";
4663 startinpos = ((const char *)q) - starts;
4664 endinpos = ((const char *)e) + 1 - starts;
4665 outpos = p - PyUnicode_AS_UNICODE(unicode);
4666 if (unicode_decode_call_errorhandler(
4667 errors,
4668 &errorHandler,
4669 "utf16", errmsg,
4670 &starts,
4671 (const char **)&e,
4672 &startinpos,
4673 &endinpos,
4674 &exc,
4675 (const char **)&q,
4676 &unicode,
4677 &outpos,
4678 &p))
4679 goto onError;
4680 /* The remaining input chars are ignored if the callback
4681 chooses to skip the input */
4682 }
4683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684
4685 if (byteorder)
4686 *byteorder = bo;
4687
Walter Dörwald69652032004-09-07 20:24:22 +00004688 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004689 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004690
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004692 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693 goto onError;
4694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695 Py_XDECREF(errorHandler);
4696 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004697 if (PyUnicode_READY(unicode) == -1) {
4698 Py_DECREF(unicode);
4699 return NULL;
4700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701 return (PyObject *)unicode;
4702
Benjamin Peterson29060642009-01-31 22:14:21 +00004703 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004705 Py_XDECREF(errorHandler);
4706 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707 return NULL;
4708}
4709
Antoine Pitrouab868312009-01-10 15:40:25 +00004710#undef FAST_CHAR_MASK
4711#undef SWAPPED_FAST_CHAR_MASK
4712
Tim Peters772747b2001-08-09 22:21:55 +00004713PyObject *
4714PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004715 Py_ssize_t size,
4716 const char *errors,
4717 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004719 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004720 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004721 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004722#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004723 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004724#else
4725 const int pairs = 0;
4726#endif
Tim Peters772747b2001-08-09 22:21:55 +00004727 /* Offsets from p for storing byte pairs in the right order. */
4728#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4729 int ihi = 1, ilo = 0;
4730#else
4731 int ihi = 0, ilo = 1;
4732#endif
4733
Benjamin Peterson29060642009-01-31 22:14:21 +00004734#define STORECHAR(CH) \
4735 do { \
4736 p[ihi] = ((CH) >> 8) & 0xff; \
4737 p[ilo] = (CH) & 0xff; \
4738 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004739 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004741#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004742 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 if (s[i] >= 0x10000)
4744 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004745#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004746 /* 2 * (size + pairs + (byteorder == 0)) */
4747 if (size > PY_SSIZE_T_MAX ||
4748 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004749 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004750 nsize = size + pairs + (byteorder == 0);
4751 bytesize = nsize * 2;
4752 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004753 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004754 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 if (v == NULL)
4756 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004758 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004760 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004761 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004762 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004763
4764 if (byteorder == -1) {
4765 /* force LE */
4766 ihi = 1;
4767 ilo = 0;
4768 }
4769 else if (byteorder == 1) {
4770 /* force BE */
4771 ihi = 0;
4772 ilo = 1;
4773 }
4774
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004775 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 Py_UNICODE ch = *s++;
4777 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004778#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 if (ch >= 0x10000) {
4780 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4781 ch = 0xD800 | ((ch-0x10000) >> 10);
4782 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004783#endif
Tim Peters772747b2001-08-09 22:21:55 +00004784 STORECHAR(ch);
4785 if (ch2)
4786 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004787 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004788
4789 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004790 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004791#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792}
4793
Alexander Belopolsky40018472011-02-26 01:02:56 +00004794PyObject *
4795PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796{
4797 if (!PyUnicode_Check(unicode)) {
4798 PyErr_BadArgument();
4799 return NULL;
4800 }
4801 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 PyUnicode_GET_SIZE(unicode),
4803 NULL,
4804 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805}
4806
4807/* --- Unicode Escape Codec ----------------------------------------------- */
4808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004809/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4810 if all the escapes in the string make it still a valid ASCII string.
4811 Returns -1 if any escapes were found which cause the string to
4812 pop out of ASCII range. Otherwise returns the length of the
4813 required buffer to hold the string.
4814 */
4815Py_ssize_t
4816length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4817{
4818 const unsigned char *p = (const unsigned char *)s;
4819 const unsigned char *end = p + size;
4820 Py_ssize_t length = 0;
4821
4822 if (size < 0)
4823 return -1;
4824
4825 for (; p < end; ++p) {
4826 if (*p > 127) {
4827 /* Non-ASCII */
4828 return -1;
4829 }
4830 else if (*p != '\\') {
4831 /* Normal character */
4832 ++length;
4833 }
4834 else {
4835 /* Backslash-escape, check next char */
4836 ++p;
4837 /* Escape sequence reaches till end of string or
4838 non-ASCII follow-up. */
4839 if (p >= end || *p > 127)
4840 return -1;
4841 switch (*p) {
4842 case '\n':
4843 /* backslash + \n result in zero characters */
4844 break;
4845 case '\\': case '\'': case '\"':
4846 case 'b': case 'f': case 't':
4847 case 'n': case 'r': case 'v': case 'a':
4848 ++length;
4849 break;
4850 case '0': case '1': case '2': case '3':
4851 case '4': case '5': case '6': case '7':
4852 case 'x': case 'u': case 'U': case 'N':
4853 /* these do not guarantee ASCII characters */
4854 return -1;
4855 default:
4856 /* count the backslash + the other character */
4857 length += 2;
4858 }
4859 }
4860 }
4861 return length;
4862}
4863
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII: `value` is stored at `index` as a Py_UNICODE
   when `kind` is PyUnicode_WCHAR_KIND and as a single byte otherwise.
   `index` is evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` at `index` of a Py_UNICODE buffer.
   NOTE: relies on a variable named `kind` being visible at the point of
   expansion; it is asserted to be PyUnicode_WCHAR_KIND.  Wrapped in
   do { } while (0) so the macro always behaves as a single statement. */
#define WRITE_WSTR(buf, index, value) \
    do { \
        assert(kind == PyUnicode_WCHAR_KIND); \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)
4877
4878
Fredrik Lundh06d12682001-01-24 07:59:11 +00004879static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004880
Alexander Belopolsky40018472011-02-26 01:02:56 +00004881PyObject *
4882PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004883 Py_ssize_t size,
4884 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004886 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004887 Py_ssize_t startinpos;
4888 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004889 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004893 char* message;
4894 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004895 PyObject *errorHandler = NULL;
4896 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004897 Py_ssize_t ascii_length;
4898 Py_ssize_t i;
4899 int kind;
4900 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004902 ascii_length = length_of_escaped_ascii_string(s, size);
4903
4904 /* After length_of_escaped_ascii_string() there are two alternatives,
4905 either the string is pure ASCII with named escapes like \n, etc.
4906 and we determined it's exact size (common case)
4907 or it contains \x, \u, ... escape sequences. then we create a
4908 legacy wchar string and resize it at the end of this function. */
4909 if (ascii_length >= 0) {
4910 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4911 if (!v)
4912 goto onError;
4913 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4914 kind = PyUnicode_1BYTE_KIND;
4915 data = PyUnicode_DATA(v);
4916 }
4917 else {
4918 /* Escaped strings will always be longer than the resulting
4919 Unicode string, so we start with size here and then reduce the
4920 length after conversion to the true value.
4921 (but if the error callback returns a long replacement string
4922 we'll have to allocate more space) */
4923 v = _PyUnicode_New(size);
4924 if (!v)
4925 goto onError;
4926 kind = PyUnicode_WCHAR_KIND;
4927 data = PyUnicode_AS_UNICODE(v);
4928 }
4929
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930 if (size == 0)
4931 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004932 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004934
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 while (s < end) {
4936 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004937 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004938 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004940 if (kind == PyUnicode_WCHAR_KIND) {
4941 assert(i < _PyUnicode_WSTR_LENGTH(v));
4942 }
4943 else {
4944 /* The only case in which i == ascii_length is a backslash
4945 followed by a newline. */
4946 assert(i <= ascii_length);
4947 }
4948
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949 /* Non-escape characters are interpreted as Unicode ordinals */
4950 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004951 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 continue;
4953 }
4954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 /* \ - Escapes */
4957 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004958 c = *s++;
4959 if (s > end)
4960 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004961
4962 if (kind == PyUnicode_WCHAR_KIND) {
4963 assert(i < _PyUnicode_WSTR_LENGTH(v));
4964 }
4965 else {
4966 /* The only case in which i == ascii_length is a backslash
4967 followed by a newline. */
4968 assert(i < ascii_length || (i == ascii_length && c == '\n'));
4969 }
4970
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004971 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004975 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
4976 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
4977 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
4978 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
4979 /* FF */
4980 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
4981 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
4982 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
4983 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
4984 /* VT */
4985 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
4986 /* BEL, not classic C */
4987 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 case '0': case '1': case '2': case '3':
4991 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004992 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004993 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004994 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004995 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004996 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004998 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999 break;
5000
Benjamin Peterson29060642009-01-31 22:14:21 +00005001 /* hex escapes */
5002 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005004 digits = 2;
5005 message = "truncated \\xXX escape";
5006 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007
Benjamin Peterson29060642009-01-31 22:14:21 +00005008 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005010 digits = 4;
5011 message = "truncated \\uXXXX escape";
5012 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013
Benjamin Peterson29060642009-01-31 22:14:21 +00005014 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005015 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005016 digits = 8;
5017 message = "truncated \\UXXXXXXXX escape";
5018 hexescape:
5019 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005020 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005021 if (s+digits>end) {
5022 endinpos = size;
5023 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 errors, &errorHandler,
5025 "unicodeescape", "end of string in escape sequence",
5026 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005027 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005028 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005029 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005030 goto nextByte;
5031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005032 for (j = 0; j < digits; ++j) {
5033 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005034 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005035 endinpos = (s+j+1)-starts;
5036 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005037 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 errors, &errorHandler,
5039 "unicodeescape", message,
5040 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005041 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005042 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005043 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005044 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005045 }
5046 chr = (chr<<4) & ~0xF;
5047 if (c >= '0' && c <= '9')
5048 chr += c - '0';
5049 else if (c >= 'a' && c <= 'f')
5050 chr += 10 + c - 'a';
5051 else
5052 chr += 10 + c - 'A';
5053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005054 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005055 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005056 /* _decoding_error will have already written into the
5057 target buffer. */
5058 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005059 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005060 /* when we get here, chr is a 32-bit unicode character */
5061 if (chr <= 0xffff)
5062 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005063 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005064 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005065 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005066 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005067#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005068 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005069#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005070 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005071 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5072 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005073#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005074 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005075 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005076 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005077 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 errors, &errorHandler,
5079 "unicodeescape", "illegal Unicode character",
5080 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005081 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005082 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005083 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005084 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005085 break;
5086
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005088 case 'N':
5089 message = "malformed \\N character escape";
5090 if (ucnhash_CAPI == NULL) {
5091 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005092 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5093 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005094 if (ucnhash_CAPI == NULL)
5095 goto ucnhashError;
5096 }
5097 if (*s == '{') {
5098 const char *start = s+1;
5099 /* look for the closing brace */
5100 while (*s != '}' && s < end)
5101 s++;
5102 if (s > start && s < end && *s == '}') {
5103 /* found a name. look it up in the unicode database */
5104 message = "unknown Unicode character name";
5105 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005106 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5107 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005108 goto store;
5109 }
5110 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005111 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005112 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005114 errors, &errorHandler,
5115 "unicodeescape", message,
5116 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005117 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005118 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005119 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005120 break;
5121
5122 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005123 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005124 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125 message = "\\ at end of string";
5126 s--;
5127 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005128 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005130 errors, &errorHandler,
5131 "unicodeescape", message,
5132 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005133 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005134 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005135 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005136 }
5137 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005138 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5139 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005140 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005141 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005143 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005144 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005146 /* Ensure the length prediction worked in case of ASCII strings */
5147 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5148
5149 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5150 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005151 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005152 Py_XDECREF(errorHandler);
5153 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005155
Benjamin Peterson29060642009-01-31 22:14:21 +00005156 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005157 PyErr_SetString(
5158 PyExc_UnicodeError,
5159 "\\N escapes not supported (can't load unicodedata module)"
5160 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005161 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005162 Py_XDECREF(errorHandler);
5163 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005164 return NULL;
5165
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005168 Py_XDECREF(errorHandler);
5169 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 return NULL;
5171}
5172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005173#undef WRITE_ASCII_OR_WSTR
5174#undef WRITE_WSTR
5175
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176/* Return a Unicode-Escape string version of the Unicode object.
5177
5178 If quotes is true, the string is enclosed in u"" or u'' quotes as
5179 appropriate.
5180
5181*/
5182
Walter Dörwald79e913e2007-05-12 11:08:06 +00005183static const char *hexdigits = "0123456789abcdef";
5184
Alexander Belopolsky40018472011-02-26 01:02:56 +00005185PyObject *
5186PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005187 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005189 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005192#ifdef Py_UNICODE_WIDE
5193 const Py_ssize_t expandsize = 10;
5194#else
5195 const Py_ssize_t expandsize = 6;
5196#endif
5197
Thomas Wouters89f507f2006-12-13 04:49:30 +00005198 /* XXX(nnorwitz): rather than over-allocating, it would be
5199 better to choose a different scheme. Perhaps scan the
5200 first N-chars of the string and allocate based on that size.
5201 */
5202 /* Initial allocation is based on the longest-possible unichr
5203 escape.
5204
5205 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5206 unichr, so in this case it's the longest unichr escape. In
5207 narrow (UTF-16) builds this is five chars per source unichr
5208 since there are two unichrs in the surrogate pair, so in narrow
5209 (UTF-16) builds it's not the longest unichr escape.
5210
5211 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5212 so in the narrow (UTF-16) build case it's the longest unichr
5213 escape.
5214 */
5215
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005216 if (size == 0)
5217 return PyBytes_FromStringAndSize(NULL, 0);
5218
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005219 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005221
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005222 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 2
5224 + expandsize*size
5225 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 if (repr == NULL)
5227 return NULL;
5228
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005229 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 while (size-- > 0) {
5232 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005233
Walter Dörwald79e913e2007-05-12 11:08:06 +00005234 /* Escape backslashes */
5235 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 *p++ = '\\';
5237 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005238 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005239 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005240
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005241#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005242 /* Map 21-bit characters to '\U00xxxxxx' */
5243 else if (ch >= 0x10000) {
5244 *p++ = '\\';
5245 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005246 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5247 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5248 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5249 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5250 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5251 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5252 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5253 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005254 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005255 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005256#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5258 else if (ch >= 0xD800 && ch < 0xDC00) {
5259 Py_UNICODE ch2;
5260 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005261
Benjamin Peterson29060642009-01-31 22:14:21 +00005262 ch2 = *s++;
5263 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005264 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5266 *p++ = '\\';
5267 *p++ = 'U';
5268 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5269 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5270 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5271 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5272 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5273 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5274 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5275 *p++ = hexdigits[ucs & 0x0000000F];
5276 continue;
5277 }
5278 /* Fall through: isolated surrogates are copied as-is */
5279 s--;
5280 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005281 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005282#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005283
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005285 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 *p++ = '\\';
5287 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005288 *p++ = hexdigits[(ch >> 12) & 0x000F];
5289 *p++ = hexdigits[(ch >> 8) & 0x000F];
5290 *p++ = hexdigits[(ch >> 4) & 0x000F];
5291 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005293
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005294 /* Map special whitespace to '\t', \n', '\r' */
5295 else if (ch == '\t') {
5296 *p++ = '\\';
5297 *p++ = 't';
5298 }
5299 else if (ch == '\n') {
5300 *p++ = '\\';
5301 *p++ = 'n';
5302 }
5303 else if (ch == '\r') {
5304 *p++ = '\\';
5305 *p++ = 'r';
5306 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005307
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005308 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005309 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005311 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005312 *p++ = hexdigits[(ch >> 4) & 0x000F];
5313 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005314 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005315
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 /* Copy everything else as-is */
5317 else
5318 *p++ = (char) ch;
5319 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005321 assert(p - PyBytes_AS_STRING(repr) > 0);
5322 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5323 return NULL;
5324 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325}
5326
Alexander Belopolsky40018472011-02-26 01:02:56 +00005327PyObject *
5328PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005330 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 if (!PyUnicode_Check(unicode)) {
5332 PyErr_BadArgument();
5333 return NULL;
5334 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005335 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5336 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005337 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338}
5339
5340/* --- Raw Unicode Escape Codec ------------------------------------------- */
5341
Alexander Belopolsky40018472011-02-26 01:02:56 +00005342PyObject *
5343PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005344 Py_ssize_t size,
5345 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005348 Py_ssize_t startinpos;
5349 Py_ssize_t endinpos;
5350 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005352 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 const char *end;
5354 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355 PyObject *errorHandler = NULL;
5356 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005357
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 /* Escaped strings will always be longer than the resulting
5359 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 length after conversion to the true value. (But decoding error
5361 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 v = _PyUnicode_New(size);
5363 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 end = s + size;
5369 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 unsigned char c;
5371 Py_UCS4 x;
5372 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005373 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 /* Non-escape characters are interpreted as Unicode ordinals */
5376 if (*s != '\\') {
5377 *p++ = (unsigned char)*s++;
5378 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005379 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005380 startinpos = s-starts;
5381
5382 /* \u-escapes are only interpreted iff the number of leading
5383 backslashes if odd */
5384 bs = s;
5385 for (;s < end;) {
5386 if (*s != '\\')
5387 break;
5388 *p++ = (unsigned char)*s++;
5389 }
5390 if (((s - bs) & 1) == 0 ||
5391 s >= end ||
5392 (*s != 'u' && *s != 'U')) {
5393 continue;
5394 }
5395 p--;
5396 count = *s=='u' ? 4 : 8;
5397 s++;
5398
5399 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5400 outpos = p-PyUnicode_AS_UNICODE(v);
5401 for (x = 0, i = 0; i < count; ++i, ++s) {
5402 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005403 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 endinpos = s-starts;
5405 if (unicode_decode_call_errorhandler(
5406 errors, &errorHandler,
5407 "rawunicodeescape", "truncated \\uXXXX",
5408 &starts, &end, &startinpos, &endinpos, &exc, &s,
5409 &v, &outpos, &p))
5410 goto onError;
5411 goto nextByte;
5412 }
5413 x = (x<<4) & ~0xF;
5414 if (c >= '0' && c <= '9')
5415 x += c - '0';
5416 else if (c >= 'a' && c <= 'f')
5417 x += 10 + c - 'a';
5418 else
5419 x += 10 + c - 'A';
5420 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005421 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 /* UCS-2 character */
5423 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005424 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 /* UCS-4 character. Either store directly, or as
5426 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005427#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005429#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005430 x -= 0x10000L;
5431 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5432 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005433#endif
5434 } else {
5435 endinpos = s-starts;
5436 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005437 if (unicode_decode_call_errorhandler(
5438 errors, &errorHandler,
5439 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 &starts, &end, &startinpos, &endinpos, &exc, &s,
5441 &v, &outpos, &p))
5442 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005443 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 nextByte:
5445 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005447 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005449 Py_XDECREF(errorHandler);
5450 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005451 if (PyUnicode_READY(v) == -1) {
5452 Py_DECREF(v);
5453 return NULL;
5454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005456
Benjamin Peterson29060642009-01-31 22:14:21 +00005457 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459 Py_XDECREF(errorHandler);
5460 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 return NULL;
5462}
5463
Alexander Belopolsky40018472011-02-26 01:02:56 +00005464PyObject *
5465PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005466 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005468 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 char *p;
5470 char *q;
5471
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005472#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005473 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005474#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005475 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005476#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005477
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005478 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005480
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005481 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 if (repr == NULL)
5483 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005484 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005485 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005487 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 while (size-- > 0) {
5489 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005490#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 /* Map 32-bit characters to '\Uxxxxxxxx' */
5492 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005493 *p++ = '\\';
5494 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005495 *p++ = hexdigits[(ch >> 28) & 0xf];
5496 *p++ = hexdigits[(ch >> 24) & 0xf];
5497 *p++ = hexdigits[(ch >> 20) & 0xf];
5498 *p++ = hexdigits[(ch >> 16) & 0xf];
5499 *p++ = hexdigits[(ch >> 12) & 0xf];
5500 *p++ = hexdigits[(ch >> 8) & 0xf];
5501 *p++ = hexdigits[(ch >> 4) & 0xf];
5502 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005503 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005504 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005505#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5507 if (ch >= 0xD800 && ch < 0xDC00) {
5508 Py_UNICODE ch2;
5509 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005510
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 ch2 = *s++;
5512 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005513 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5515 *p++ = '\\';
5516 *p++ = 'U';
5517 *p++ = hexdigits[(ucs >> 28) & 0xf];
5518 *p++ = hexdigits[(ucs >> 24) & 0xf];
5519 *p++ = hexdigits[(ucs >> 20) & 0xf];
5520 *p++ = hexdigits[(ucs >> 16) & 0xf];
5521 *p++ = hexdigits[(ucs >> 12) & 0xf];
5522 *p++ = hexdigits[(ucs >> 8) & 0xf];
5523 *p++ = hexdigits[(ucs >> 4) & 0xf];
5524 *p++ = hexdigits[ucs & 0xf];
5525 continue;
5526 }
5527 /* Fall through: isolated surrogates are copied as-is */
5528 s--;
5529 size++;
5530 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005531#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005532 /* Map 16-bit characters to '\uxxxx' */
5533 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 *p++ = '\\';
5535 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005536 *p++ = hexdigits[(ch >> 12) & 0xf];
5537 *p++ = hexdigits[(ch >> 8) & 0xf];
5538 *p++ = hexdigits[(ch >> 4) & 0xf];
5539 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 /* Copy everything else as-is */
5542 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 *p++ = (char) ch;
5544 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005545 size = p - q;
5546
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005547 assert(size > 0);
5548 if (_PyBytes_Resize(&repr, size) < 0)
5549 return NULL;
5550 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551}
5552
Alexander Belopolsky40018472011-02-26 01:02:56 +00005553PyObject *
5554PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005556 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005558 PyErr_BadArgument();
5559 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005561 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5562 PyUnicode_GET_SIZE(unicode));
5563
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005564 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565}
5566
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005567/* --- Unicode Internal Codec ------------------------------------------- */
5568
Alexander Belopolsky40018472011-02-26 01:02:56 +00005569PyObject *
5570_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005571 Py_ssize_t size,
5572 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005573{
5574 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005575 Py_ssize_t startinpos;
5576 Py_ssize_t endinpos;
5577 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005578 PyUnicodeObject *v;
5579 Py_UNICODE *p;
5580 const char *end;
5581 const char *reason;
5582 PyObject *errorHandler = NULL;
5583 PyObject *exc = NULL;
5584
Neal Norwitzd43069c2006-01-08 01:12:10 +00005585#ifdef Py_UNICODE_WIDE
5586 Py_UNICODE unimax = PyUnicode_GetMax();
5587#endif
5588
Thomas Wouters89f507f2006-12-13 04:49:30 +00005589 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005590 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5591 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005593 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5594 as string was created with the old API. */
5595 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005597 p = PyUnicode_AS_UNICODE(v);
5598 end = s + size;
5599
5600 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005601 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005602 /* We have to sanity check the raw data, otherwise doom looms for
5603 some malformed UCS-4 data. */
5604 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005605#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005606 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005607#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005608 end-s < Py_UNICODE_SIZE
5609 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005611 startinpos = s - starts;
5612 if (end-s < Py_UNICODE_SIZE) {
5613 endinpos = end-starts;
5614 reason = "truncated input";
5615 }
5616 else {
5617 endinpos = s - starts + Py_UNICODE_SIZE;
5618 reason = "illegal code point (> 0x10FFFF)";
5619 }
5620 outpos = p - PyUnicode_AS_UNICODE(v);
5621 if (unicode_decode_call_errorhandler(
5622 errors, &errorHandler,
5623 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005624 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005625 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005626 goto onError;
5627 }
5628 }
5629 else {
5630 p++;
5631 s += Py_UNICODE_SIZE;
5632 }
5633 }
5634
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005635 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005636 goto onError;
5637 Py_XDECREF(errorHandler);
5638 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005639 if (PyUnicode_READY(v) == -1) {
5640 Py_DECREF(v);
5641 return NULL;
5642 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005643 return (PyObject *)v;
5644
Benjamin Peterson29060642009-01-31 22:14:21 +00005645 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005646 Py_XDECREF(v);
5647 Py_XDECREF(errorHandler);
5648 Py_XDECREF(exc);
5649 return NULL;
5650}
5651
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652/* --- Latin-1 Codec ------------------------------------------------------ */
5653
Alexander Belopolsky40018472011-02-26 01:02:56 +00005654PyObject *
5655PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005656 Py_ssize_t size,
5657 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005660 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661}
5662
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005663/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005664static void
5665make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005666 const char *encoding,
5667 const Py_UNICODE *unicode, Py_ssize_t size,
5668 Py_ssize_t startpos, Py_ssize_t endpos,
5669 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 *exceptionObject = PyUnicodeEncodeError_Create(
5673 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 }
5675 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5677 goto onError;
5678 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5679 goto onError;
5680 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5681 goto onError;
5682 return;
5683 onError:
5684 Py_DECREF(*exceptionObject);
5685 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 }
5687}
5688
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005690static void
5691raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005692 const char *encoding,
5693 const Py_UNICODE *unicode, Py_ssize_t size,
5694 Py_ssize_t startpos, Py_ssize_t endpos,
5695 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005696{
5697 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701}
5702
5703/* error handling callback helper:
5704 build arguments, call the callback and check the arguments,
5705 put the result into newpos and return the replacement string, which
5706 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005707static PyObject *
5708unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005709 PyObject **errorHandler,
5710 const char *encoding, const char *reason,
5711 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5712 Py_ssize_t startpos, Py_ssize_t endpos,
5713 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005714{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005715 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716
5717 PyObject *restuple;
5718 PyObject *resunicode;
5719
5720 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005722 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 }
5725
5726 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005730
5731 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005732 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005733 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005736 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 Py_DECREF(restuple);
5738 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005740 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 &resunicode, newpos)) {
5742 Py_DECREF(restuple);
5743 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005745 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5746 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5747 Py_DECREF(restuple);
5748 return NULL;
5749 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005750 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005752 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005753 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5754 Py_DECREF(restuple);
5755 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005757 Py_INCREF(resunicode);
5758 Py_DECREF(restuple);
5759 return resunicode;
5760}
5761
Alexander Belopolsky40018472011-02-26 01:02:56 +00005762static PyObject *
5763unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005764 Py_ssize_t size,
5765 const char *errors,
5766 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767{
5768 /* output object */
5769 PyObject *res;
5770 /* pointers to the beginning and end+1 of input */
5771 const Py_UNICODE *startp = p;
5772 const Py_UNICODE *endp = p + size;
5773 /* pointer to the beginning of the unencodable characters */
5774 /* const Py_UNICODE *badp = NULL; */
5775 /* pointer into the output */
5776 char *str;
5777 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005778 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005779 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5780 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 PyObject *errorHandler = NULL;
5782 PyObject *exc = NULL;
5783 /* the following variable is used for caching string comparisons
5784 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5785 int known_errorHandler = -1;
5786
5787 /* allocate enough for a simple encoding without
5788 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005789 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005790 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005791 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005793 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005794 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 ressize = size;
5796
5797 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005799
Benjamin Peterson29060642009-01-31 22:14:21 +00005800 /* can we encode this? */
5801 if (c<limit) {
5802 /* no overflow check, because we know that the space is enough */
5803 *str++ = (char)c;
5804 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005805 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 else {
5807 Py_ssize_t unicodepos = p-startp;
5808 Py_ssize_t requiredsize;
5809 PyObject *repunicode;
5810 Py_ssize_t repsize;
5811 Py_ssize_t newpos;
5812 Py_ssize_t respos;
5813 Py_UNICODE *uni2;
5814 /* startpos for collecting unencodable chars */
5815 const Py_UNICODE *collstart = p;
5816 const Py_UNICODE *collend = p;
5817 /* find all unecodable characters */
5818 while ((collend < endp) && ((*collend)>=limit))
5819 ++collend;
5820 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5821 if (known_errorHandler==-1) {
5822 if ((errors==NULL) || (!strcmp(errors, "strict")))
5823 known_errorHandler = 1;
5824 else if (!strcmp(errors, "replace"))
5825 known_errorHandler = 2;
5826 else if (!strcmp(errors, "ignore"))
5827 known_errorHandler = 3;
5828 else if (!strcmp(errors, "xmlcharrefreplace"))
5829 known_errorHandler = 4;
5830 else
5831 known_errorHandler = 0;
5832 }
5833 switch (known_errorHandler) {
5834 case 1: /* strict */
5835 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5836 goto onError;
5837 case 2: /* replace */
5838 while (collstart++<collend)
5839 *str++ = '?'; /* fall through */
5840 case 3: /* ignore */
5841 p = collend;
5842 break;
5843 case 4: /* xmlcharrefreplace */
5844 respos = str - PyBytes_AS_STRING(res);
5845 /* determine replacement size (temporarily (mis)uses p) */
5846 for (p = collstart, repsize = 0; p < collend; ++p) {
5847 if (*p<10)
5848 repsize += 2+1+1;
5849 else if (*p<100)
5850 repsize += 2+2+1;
5851 else if (*p<1000)
5852 repsize += 2+3+1;
5853 else if (*p<10000)
5854 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005855#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005856 else
5857 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005858#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 else if (*p<100000)
5860 repsize += 2+5+1;
5861 else if (*p<1000000)
5862 repsize += 2+6+1;
5863 else
5864 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005865#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 }
5867 requiredsize = respos+repsize+(endp-collend);
5868 if (requiredsize > ressize) {
5869 if (requiredsize<2*ressize)
5870 requiredsize = 2*ressize;
5871 if (_PyBytes_Resize(&res, requiredsize))
5872 goto onError;
5873 str = PyBytes_AS_STRING(res) + respos;
5874 ressize = requiredsize;
5875 }
5876 /* generate replacement (temporarily (mis)uses p) */
5877 for (p = collstart; p < collend; ++p) {
5878 str += sprintf(str, "&#%d;", (int)*p);
5879 }
5880 p = collend;
5881 break;
5882 default:
5883 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5884 encoding, reason, startp, size, &exc,
5885 collstart-startp, collend-startp, &newpos);
5886 if (repunicode == NULL)
5887 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005888 if (PyBytes_Check(repunicode)) {
5889 /* Directly copy bytes result to output. */
5890 repsize = PyBytes_Size(repunicode);
5891 if (repsize > 1) {
5892 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005893 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005894 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5895 Py_DECREF(repunicode);
5896 goto onError;
5897 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005898 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005899 ressize += repsize-1;
5900 }
5901 memcpy(str, PyBytes_AsString(repunicode), repsize);
5902 str += repsize;
5903 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005904 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005905 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005906 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 /* need more space? (at least enough for what we
5908 have+the replacement+the rest of the string, so
5909 we won't have to check space for encodable characters) */
5910 respos = str - PyBytes_AS_STRING(res);
5911 repsize = PyUnicode_GET_SIZE(repunicode);
5912 requiredsize = respos+repsize+(endp-collend);
5913 if (requiredsize > ressize) {
5914 if (requiredsize<2*ressize)
5915 requiredsize = 2*ressize;
5916 if (_PyBytes_Resize(&res, requiredsize)) {
5917 Py_DECREF(repunicode);
5918 goto onError;
5919 }
5920 str = PyBytes_AS_STRING(res) + respos;
5921 ressize = requiredsize;
5922 }
5923 /* check if there is anything unencodable in the replacement
5924 and copy it to the output */
5925 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5926 c = *uni2;
5927 if (c >= limit) {
5928 raise_encode_exception(&exc, encoding, startp, size,
5929 unicodepos, unicodepos+1, reason);
5930 Py_DECREF(repunicode);
5931 goto onError;
5932 }
5933 *str = (char)c;
5934 }
5935 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005936 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005937 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005938 }
5939 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005940 /* Resize if we allocated to much */
5941 size = str - PyBytes_AS_STRING(res);
5942 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005943 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005944 if (_PyBytes_Resize(&res, size) < 0)
5945 goto onError;
5946 }
5947
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005948 Py_XDECREF(errorHandler);
5949 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005950 return res;
5951
5952 onError:
5953 Py_XDECREF(res);
5954 Py_XDECREF(errorHandler);
5955 Py_XDECREF(exc);
5956 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005957}
5958
Alexander Belopolsky40018472011-02-26 01:02:56 +00005959PyObject *
5960PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005961 Py_ssize_t size,
5962 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005964 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965}
5966
Alexander Belopolsky40018472011-02-26 01:02:56 +00005967PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005968_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969{
5970 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 PyErr_BadArgument();
5972 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005974 if (PyUnicode_READY(unicode) == -1)
5975 return NULL;
5976 /* Fast path: if it is a one-byte string, construct
5977 bytes object directly. */
5978 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
5979 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
5980 PyUnicode_GET_LENGTH(unicode));
5981 /* Non-Latin-1 characters present. Defer to above function to
5982 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005985 errors);
5986}
5987
5988PyObject*
5989PyUnicode_AsLatin1String(PyObject *unicode)
5990{
5991 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992}
5993
5994/* --- 7-bit ASCII Codec -------------------------------------------------- */
5995
Alexander Belopolsky40018472011-02-26 01:02:56 +00005996PyObject *
5997PyUnicode_DecodeASCII(const char *s,
5998 Py_ssize_t size,
5999 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006001 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 PyUnicodeObject *v;
6003 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006004 Py_ssize_t startinpos;
6005 Py_ssize_t endinpos;
6006 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006008 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006009 PyObject *errorHandler = NULL;
6010 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006011 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006012
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006014 if (size == 1 && *(unsigned char*)s < 128)
6015 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6016
6017 /* Fast path. Assume the input actually *is* ASCII, and allocate
6018 a single-block Unicode object with that assumption. If there is
6019 an error, drop the object and start over. */
6020 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6021 if (v == NULL)
6022 goto onError;
6023 d = PyUnicode_1BYTE_DATA(v);
6024 for (i = 0; i < size; i++) {
6025 unsigned char ch = ((unsigned char*)s)[i];
6026 if (ch < 128)
6027 d[i] = ch;
6028 else
6029 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006031 if (i == size)
6032 return (PyObject*)v;
6033 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006034
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 v = _PyUnicode_New(size);
6036 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006041 e = s + size;
6042 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 register unsigned char c = (unsigned char)*s;
6044 if (c < 128) {
6045 *p++ = c;
6046 ++s;
6047 }
6048 else {
6049 startinpos = s-starts;
6050 endinpos = startinpos + 1;
6051 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6052 if (unicode_decode_call_errorhandler(
6053 errors, &errorHandler,
6054 "ascii", "ordinal not in range(128)",
6055 &starts, &e, &startinpos, &endinpos, &exc, &s,
6056 &v, &outpos, &p))
6057 goto onError;
6058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006060 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6062 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006063 Py_XDECREF(errorHandler);
6064 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006065 if (PyUnicode_READY(v) == -1) {
6066 Py_DECREF(v);
6067 return NULL;
6068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006070
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006073 Py_XDECREF(errorHandler);
6074 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 return NULL;
6076}
6077
Alexander Belopolsky40018472011-02-26 01:02:56 +00006078PyObject *
6079PyUnicode_EncodeASCII(const Py_UNICODE *p,
6080 Py_ssize_t size,
6081 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006083 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084}
6085
Alexander Belopolsky40018472011-02-26 01:02:56 +00006086PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006087_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088{
6089 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 PyErr_BadArgument();
6091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006093 if (PyUnicode_READY(unicode) == -1)
6094 return NULL;
6095 /* Fast path: if it is an ASCII-only string, construct bytes object
6096 directly. Else defer to above function to raise the exception. */
6097 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6098 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6099 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006102 errors);
6103}
6104
6105PyObject *
6106PyUnicode_AsASCIIString(PyObject *unicode)
6107{
6108 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109}
6110
Victor Stinner99b95382011-07-04 14:23:54 +02006111#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006112
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006113/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006114
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006115#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006116#define NEED_RETRY
6117#endif
6118
6119/* XXX This code is limited to "true" double-byte encodings, as
6120 a) it assumes an incomplete character consists of a single byte, and
6121 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006123
/* Return non-zero if the byte at s[offset] is a DBCS lead byte that
   starts a character, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only reported as a lead byte
   if CharPrev() cannot step back from it, the previous character does
   not itself start with a lead byte, or the previous character is two
   bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr)) {
        return 0;
    }

    prev = CharPrev(s, curr);
    if (prev == curr) {
        return 1;
    }
    if (!IsDBCSLeadByte(*prev)) {
        return 1;
    }
    return curr - prev == 2;
}
6135
6136/*
6137 * Decode MBCS string into unicode object. If 'final' is set, converts
6138 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6139 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006140static int
6141decode_mbcs(PyUnicodeObject **v,
6142 const char *s, /* MBCS string */
6143 int size, /* sizeof MBCS string */
6144 int final,
6145 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006146{
6147 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006148 Py_ssize_t n;
6149 DWORD usize;
6150 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006151
6152 assert(size >= 0);
6153
Victor Stinner554f3f02010-06-16 23:33:54 +00006154 /* check and handle 'errors' arg */
6155 if (errors==NULL || strcmp(errors, "strict")==0)
6156 flags = MB_ERR_INVALID_CHARS;
6157 else if (strcmp(errors, "ignore")==0)
6158 flags = 0;
6159 else {
6160 PyErr_Format(PyExc_ValueError,
6161 "mbcs encoding does not support errors='%s'",
6162 errors);
6163 return -1;
6164 }
6165
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006166 /* Skip trailing lead-byte unless 'final' is set */
6167 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006169
6170 /* First get the size of the result */
6171 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006172 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6173 if (usize==0)
6174 goto mbcs_decode_error;
6175 } else
6176 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006177
6178 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 /* Create unicode object */
6180 *v = _PyUnicode_New(usize);
6181 if (*v == NULL)
6182 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006183 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006184 }
6185 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 /* Extend unicode object */
6187 n = PyUnicode_GET_SIZE(*v);
6188 if (_PyUnicode_Resize(v, n + usize) < 0)
6189 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006190 }
6191
6192 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006193 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006195 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6196 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006198 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006199 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006200
6201mbcs_decode_error:
6202 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6203 we raise a UnicodeDecodeError - else it is a 'generic'
6204 windows error
6205 */
6206 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6207 /* Ideally, we should get reason from FormatMessage - this
6208 is the Windows 2000 English version of the message
6209 */
6210 PyObject *exc = NULL;
6211 const char *reason = "No mapping for the Unicode character exists "
6212 "in the target multi-byte code page.";
6213 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6214 if (exc != NULL) {
6215 PyCodec_StrictErrors(exc);
6216 Py_DECREF(exc);
6217 }
6218 } else {
6219 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6220 }
6221 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006222}
6223
Alexander Belopolsky40018472011-02-26 01:02:56 +00006224PyObject *
6225PyUnicode_DecodeMBCSStateful(const char *s,
6226 Py_ssize_t size,
6227 const char *errors,
6228 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006229{
6230 PyUnicodeObject *v = NULL;
6231 int done;
6232
6233 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006235
6236#ifdef NEED_RETRY
6237 retry:
6238 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006239 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006240 else
6241#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006242 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006243
6244 if (done < 0) {
6245 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006247 }
6248
6249 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006251
6252#ifdef NEED_RETRY
6253 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 s += done;
6255 size -= done;
6256 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006257 }
6258#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006259 if (PyUnicode_READY(v) == -1) {
6260 Py_DECREF(v);
6261 return NULL;
6262 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006263 return (PyObject *)v;
6264}
6265
Alexander Belopolsky40018472011-02-26 01:02:56 +00006266PyObject *
6267PyUnicode_DecodeMBCS(const char *s,
6268 Py_ssize_t size,
6269 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006270{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006271 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6272}
6273
6274/*
6275 * Convert unicode into string object (MBCS).
6276 * Returns 0 if succeed, -1 otherwise.
6277 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006278static int
6279encode_mbcs(PyObject **repr,
6280 const Py_UNICODE *p, /* unicode */
6281 int size, /* size of unicode */
6282 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006283{
Victor Stinner554f3f02010-06-16 23:33:54 +00006284 BOOL usedDefaultChar = FALSE;
6285 BOOL *pusedDefaultChar;
6286 int mbcssize;
6287 Py_ssize_t n;
6288 PyObject *exc = NULL;
6289 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006290
6291 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006292
Victor Stinner554f3f02010-06-16 23:33:54 +00006293 /* check and handle 'errors' arg */
6294 if (errors==NULL || strcmp(errors, "strict")==0) {
6295 flags = WC_NO_BEST_FIT_CHARS;
6296 pusedDefaultChar = &usedDefaultChar;
6297 } else if (strcmp(errors, "replace")==0) {
6298 flags = 0;
6299 pusedDefaultChar = NULL;
6300 } else {
6301 PyErr_Format(PyExc_ValueError,
6302 "mbcs encoding does not support errors='%s'",
6303 errors);
6304 return -1;
6305 }
6306
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006307 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006308 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006309 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6310 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 if (mbcssize == 0) {
6312 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6313 return -1;
6314 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006315 /* If we used a default char, then we failed! */
6316 if (pusedDefaultChar && *pusedDefaultChar)
6317 goto mbcs_encode_error;
6318 } else {
6319 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006320 }
6321
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006322 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 /* Create string object */
6324 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6325 if (*repr == NULL)
6326 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006327 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006328 }
6329 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 /* Extend string object */
6331 n = PyBytes_Size(*repr);
6332 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6333 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006334 }
6335
6336 /* Do the conversion */
6337 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006339 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6340 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6342 return -1;
6343 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006344 if (pusedDefaultChar && *pusedDefaultChar)
6345 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006346 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006347 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006348
6349mbcs_encode_error:
6350 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6351 Py_XDECREF(exc);
6352 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006353}
6354
Alexander Belopolsky40018472011-02-26 01:02:56 +00006355PyObject *
6356PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6357 Py_ssize_t size,
6358 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006359{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006360 PyObject *repr = NULL;
6361 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006362
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006363#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006365 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006366 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006367 else
6368#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006369 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006370
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006371 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 Py_XDECREF(repr);
6373 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006374 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006375
6376#ifdef NEED_RETRY
6377 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 p += INT_MAX;
6379 size -= INT_MAX;
6380 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006381 }
6382#endif
6383
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006384 return repr;
6385}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006386
Alexander Belopolsky40018472011-02-26 01:02:56 +00006387PyObject *
6388PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006389{
6390 if (!PyUnicode_Check(unicode)) {
6391 PyErr_BadArgument();
6392 return NULL;
6393 }
6394 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 PyUnicode_GET_SIZE(unicode),
6396 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006397}
6398
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006399#undef NEED_RETRY
6400
Victor Stinner99b95382011-07-04 14:23:54 +02006401#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006402
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403/* --- Character Mapping Codec -------------------------------------------- */
6404
Alexander Belopolsky40018472011-02-26 01:02:56 +00006405PyObject *
6406PyUnicode_DecodeCharmap(const char *s,
6407 Py_ssize_t size,
6408 PyObject *mapping,
6409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006412 Py_ssize_t startinpos;
6413 Py_ssize_t endinpos;
6414 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 PyUnicodeObject *v;
6417 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006418 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 PyObject *errorHandler = NULL;
6420 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006421 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006422 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006423
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 /* Default to Latin-1 */
6425 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427
6428 v = _PyUnicode_New(size);
6429 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006435 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 mapstring = PyUnicode_AS_UNICODE(mapping);
6437 maplen = PyUnicode_GET_SIZE(mapping);
6438 while (s < e) {
6439 unsigned char ch = *s;
6440 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 if (ch < maplen)
6443 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 if (x == 0xfffe) {
6446 /* undefined mapping */
6447 outpos = p-PyUnicode_AS_UNICODE(v);
6448 startinpos = s-starts;
6449 endinpos = startinpos+1;
6450 if (unicode_decode_call_errorhandler(
6451 errors, &errorHandler,
6452 "charmap", "character maps to <undefined>",
6453 &starts, &e, &startinpos, &endinpos, &exc, &s,
6454 &v, &outpos, &p)) {
6455 goto onError;
6456 }
6457 continue;
6458 }
6459 *p++ = x;
6460 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006461 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006462 }
6463 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 while (s < e) {
6465 unsigned char ch = *s;
6466 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006467
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6469 w = PyLong_FromLong((long)ch);
6470 if (w == NULL)
6471 goto onError;
6472 x = PyObject_GetItem(mapping, w);
6473 Py_DECREF(w);
6474 if (x == NULL) {
6475 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6476 /* No mapping found means: mapping is undefined. */
6477 PyErr_Clear();
6478 x = Py_None;
6479 Py_INCREF(x);
6480 } else
6481 goto onError;
6482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006483
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 /* Apply mapping */
6485 if (PyLong_Check(x)) {
6486 long value = PyLong_AS_LONG(x);
6487 if (value < 0 || value > 65535) {
6488 PyErr_SetString(PyExc_TypeError,
6489 "character mapping must be in range(65536)");
6490 Py_DECREF(x);
6491 goto onError;
6492 }
6493 *p++ = (Py_UNICODE)value;
6494 }
6495 else if (x == Py_None) {
6496 /* undefined mapping */
6497 outpos = p-PyUnicode_AS_UNICODE(v);
6498 startinpos = s-starts;
6499 endinpos = startinpos+1;
6500 if (unicode_decode_call_errorhandler(
6501 errors, &errorHandler,
6502 "charmap", "character maps to <undefined>",
6503 &starts, &e, &startinpos, &endinpos, &exc, &s,
6504 &v, &outpos, &p)) {
6505 Py_DECREF(x);
6506 goto onError;
6507 }
6508 Py_DECREF(x);
6509 continue;
6510 }
6511 else if (PyUnicode_Check(x)) {
6512 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006513
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 if (targetsize == 1)
6515 /* 1-1 mapping */
6516 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006517
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 else if (targetsize > 1) {
6519 /* 1-n mapping */
6520 if (targetsize > extrachars) {
6521 /* resize first */
6522 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6523 Py_ssize_t needed = (targetsize - extrachars) + \
6524 (targetsize << 2);
6525 extrachars += needed;
6526 /* XXX overflow detection missing */
6527 if (_PyUnicode_Resize(&v,
6528 PyUnicode_GET_SIZE(v) + needed) < 0) {
6529 Py_DECREF(x);
6530 goto onError;
6531 }
6532 p = PyUnicode_AS_UNICODE(v) + oldpos;
6533 }
6534 Py_UNICODE_COPY(p,
6535 PyUnicode_AS_UNICODE(x),
6536 targetsize);
6537 p += targetsize;
6538 extrachars -= targetsize;
6539 }
6540 /* 1-0 mapping: skip the character */
6541 }
6542 else {
6543 /* wrong return value */
6544 PyErr_SetString(PyExc_TypeError,
6545 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006546 Py_DECREF(x);
6547 goto onError;
6548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 Py_DECREF(x);
6550 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 }
6553 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6555 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006556 Py_XDECREF(errorHandler);
6557 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006558 if (PyUnicode_READY(v) == -1) {
6559 Py_DECREF(v);
6560 return NULL;
6561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006563
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006565 Py_XDECREF(errorHandler);
6566 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 Py_XDECREF(v);
6568 return NULL;
6569}
6570
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006571/* Charmap encoding: the lookup table */
6572
Alexander Belopolsky40018472011-02-26 01:02:56 +00006573struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 PyObject_HEAD
6575 unsigned char level1[32];
6576 int count2, count3;
6577 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006578};
6579
6580static PyObject*
6581encoding_map_size(PyObject *obj, PyObject* args)
6582{
6583 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006584 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006586}
6587
6588static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006589 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 PyDoc_STR("Return the size (in bytes) of this object") },
6591 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006592};
6593
6594static void
6595encoding_map_dealloc(PyObject* o)
6596{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006597 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006598}
6599
6600static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006601 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 "EncodingMap", /*tp_name*/
6603 sizeof(struct encoding_map), /*tp_basicsize*/
6604 0, /*tp_itemsize*/
6605 /* methods */
6606 encoding_map_dealloc, /*tp_dealloc*/
6607 0, /*tp_print*/
6608 0, /*tp_getattr*/
6609 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006610 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 0, /*tp_repr*/
6612 0, /*tp_as_number*/
6613 0, /*tp_as_sequence*/
6614 0, /*tp_as_mapping*/
6615 0, /*tp_hash*/
6616 0, /*tp_call*/
6617 0, /*tp_str*/
6618 0, /*tp_getattro*/
6619 0, /*tp_setattro*/
6620 0, /*tp_as_buffer*/
6621 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6622 0, /*tp_doc*/
6623 0, /*tp_traverse*/
6624 0, /*tp_clear*/
6625 0, /*tp_richcompare*/
6626 0, /*tp_weaklistoffset*/
6627 0, /*tp_iter*/
6628 0, /*tp_iternext*/
6629 encoding_map_methods, /*tp_methods*/
6630 0, /*tp_members*/
6631 0, /*tp_getset*/
6632 0, /*tp_base*/
6633 0, /*tp_dict*/
6634 0, /*tp_descr_get*/
6635 0, /*tp_descr_set*/
6636 0, /*tp_dictoffset*/
6637 0, /*tp_init*/
6638 0, /*tp_alloc*/
6639 0, /*tp_new*/
6640 0, /*tp_free*/
6641 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006642};
6643
6644PyObject*
6645PyUnicode_BuildEncodingMap(PyObject* string)
6646{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006647 PyObject *result;
6648 struct encoding_map *mresult;
6649 int i;
6650 int need_dict = 0;
6651 unsigned char level1[32];
6652 unsigned char level2[512];
6653 unsigned char *mlevel1, *mlevel2, *mlevel3;
6654 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006655 int kind;
6656 void *data;
6657 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006659 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006660 PyErr_BadArgument();
6661 return NULL;
6662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006663 kind = PyUnicode_KIND(string);
6664 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006665 memset(level1, 0xFF, sizeof level1);
6666 memset(level2, 0xFF, sizeof level2);
6667
6668 /* If there isn't a one-to-one mapping of NULL to \0,
6669 or if there are non-BMP characters, we need to use
6670 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006671 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006672 need_dict = 1;
6673 for (i = 1; i < 256; i++) {
6674 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006675 ch = PyUnicode_READ(kind, data, i);
6676 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006677 need_dict = 1;
6678 break;
6679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006680 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006681 /* unmapped character */
6682 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006683 l1 = ch >> 11;
6684 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006685 if (level1[l1] == 0xFF)
6686 level1[l1] = count2++;
6687 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006688 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006689 }
6690
6691 if (count2 >= 0xFF || count3 >= 0xFF)
6692 need_dict = 1;
6693
6694 if (need_dict) {
6695 PyObject *result = PyDict_New();
6696 PyObject *key, *value;
6697 if (!result)
6698 return NULL;
6699 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006700 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006701 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006702 if (!key || !value)
6703 goto failed1;
6704 if (PyDict_SetItem(result, key, value) == -1)
6705 goto failed1;
6706 Py_DECREF(key);
6707 Py_DECREF(value);
6708 }
6709 return result;
6710 failed1:
6711 Py_XDECREF(key);
6712 Py_XDECREF(value);
6713 Py_DECREF(result);
6714 return NULL;
6715 }
6716
6717 /* Create a three-level trie */
6718 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6719 16*count2 + 128*count3 - 1);
6720 if (!result)
6721 return PyErr_NoMemory();
6722 PyObject_Init(result, &EncodingMapType);
6723 mresult = (struct encoding_map*)result;
6724 mresult->count2 = count2;
6725 mresult->count3 = count3;
6726 mlevel1 = mresult->level1;
6727 mlevel2 = mresult->level23;
6728 mlevel3 = mresult->level23 + 16*count2;
6729 memcpy(mlevel1, level1, 32);
6730 memset(mlevel2, 0xFF, 16*count2);
6731 memset(mlevel3, 0, 128*count3);
6732 count3 = 0;
6733 for (i = 1; i < 256; i++) {
6734 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006735 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006736 /* unmapped character */
6737 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006738 o1 = PyUnicode_READ(kind, data, i)>>11;
6739 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006740 i2 = 16*mlevel1[o1] + o2;
6741 if (mlevel2[i2] == 0xFF)
6742 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006743 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006744 i3 = 128*mlevel2[i2] + o3;
6745 mlevel3[i3] = i;
6746 }
6747 return result;
6748}
6749
6750static int
6751encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6752{
6753 struct encoding_map *map = (struct encoding_map*)mapping;
6754 int l1 = c>>11;
6755 int l2 = (c>>7) & 0xF;
6756 int l3 = c & 0x7F;
6757 int i;
6758
6759#ifdef Py_UNICODE_WIDE
6760 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006762 }
6763#endif
6764 if (c == 0)
6765 return 0;
6766 /* level 1*/
6767 i = map->level1[l1];
6768 if (i == 0xFF) {
6769 return -1;
6770 }
6771 /* level 2*/
6772 i = map->level23[16*i+l2];
6773 if (i == 0xFF) {
6774 return -1;
6775 }
6776 /* level 3 */
6777 i = map->level23[16*map->count2 + 128*i + l3];
6778 if (i == 0) {
6779 return -1;
6780 }
6781 return i;
6782}
6783
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006784/* Lookup the character ch in the mapping. If the character
6785 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006786 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006787static PyObject *
6788charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789{
Christian Heimes217cfd12007-12-02 14:31:20 +00006790 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006791 PyObject *x;
6792
6793 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006794 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006795 x = PyObject_GetItem(mapping, w);
6796 Py_DECREF(w);
6797 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6799 /* No mapping found means: mapping is undefined. */
6800 PyErr_Clear();
6801 x = Py_None;
6802 Py_INCREF(x);
6803 return x;
6804 } else
6805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006807 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006809 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 long value = PyLong_AS_LONG(x);
6811 if (value < 0 || value > 255) {
6812 PyErr_SetString(PyExc_TypeError,
6813 "character mapping must be in range(256)");
6814 Py_DECREF(x);
6815 return NULL;
6816 }
6817 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006819 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 /* wrong return value */
6823 PyErr_Format(PyExc_TypeError,
6824 "character mapping must return integer, bytes or None, not %.400s",
6825 x->ob_type->tp_name);
6826 Py_DECREF(x);
6827 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 }
6829}
6830
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006831static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006832charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006833{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006834 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6835 /* exponentially overallocate to minimize reallocations */
6836 if (requiredsize < 2*outsize)
6837 requiredsize = 2*outsize;
6838 if (_PyBytes_Resize(outobj, requiredsize))
6839 return -1;
6840 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006841}
6842
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006846/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006847 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006848 space is available. Return a new reference to the object that
6849 was put in the output buffer, or Py_None, if the mapping was undefined
6850 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006851 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006852static charmapencode_result
6853charmapencode_output(Py_UNICODE c, PyObject *mapping,
6854 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006855{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006856 PyObject *rep;
6857 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006858 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006859
Christian Heimes90aa7642007-12-19 02:45:37 +00006860 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006861 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006863 if (res == -1)
6864 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 if (outsize<requiredsize)
6866 if (charmapencode_resize(outobj, outpos, requiredsize))
6867 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006868 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 outstart[(*outpos)++] = (char)res;
6870 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006871 }
6872
6873 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006874 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006876 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 Py_DECREF(rep);
6878 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006879 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 if (PyLong_Check(rep)) {
6881 Py_ssize_t requiredsize = *outpos+1;
6882 if (outsize<requiredsize)
6883 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6884 Py_DECREF(rep);
6885 return enc_EXCEPTION;
6886 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006887 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006889 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 else {
6891 const char *repchars = PyBytes_AS_STRING(rep);
6892 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6893 Py_ssize_t requiredsize = *outpos+repsize;
6894 if (outsize<requiredsize)
6895 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6896 Py_DECREF(rep);
6897 return enc_EXCEPTION;
6898 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006899 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 memcpy(outstart + *outpos, repchars, repsize);
6901 *outpos += repsize;
6902 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006903 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006904 Py_DECREF(rep);
6905 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006906}
6907
6908/* handle an error in PyUnicode_EncodeCharmap
6909 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006910static int
6911charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006912 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006914 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006915 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006916{
6917 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006918 Py_ssize_t repsize;
6919 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006920 Py_UNICODE *uni2;
6921 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006922 Py_ssize_t collstartpos = *inpos;
6923 Py_ssize_t collendpos = *inpos+1;
6924 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006925 char *encoding = "charmap";
6926 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006927 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006928
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006929 /* find all unencodable characters */
6930 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006931 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006932 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 int res = encoding_map_lookup(p[collendpos], mapping);
6934 if (res != -1)
6935 break;
6936 ++collendpos;
6937 continue;
6938 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006939
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 rep = charmapencode_lookup(p[collendpos], mapping);
6941 if (rep==NULL)
6942 return -1;
6943 else if (rep!=Py_None) {
6944 Py_DECREF(rep);
6945 break;
6946 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006947 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006949 }
6950 /* cache callback name lookup
6951 * (if not done yet, i.e. it's the first error) */
6952 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 if ((errors==NULL) || (!strcmp(errors, "strict")))
6954 *known_errorHandler = 1;
6955 else if (!strcmp(errors, "replace"))
6956 *known_errorHandler = 2;
6957 else if (!strcmp(errors, "ignore"))
6958 *known_errorHandler = 3;
6959 else if (!strcmp(errors, "xmlcharrefreplace"))
6960 *known_errorHandler = 4;
6961 else
6962 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006963 }
6964 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006965 case 1: /* strict */
6966 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6967 return -1;
6968 case 2: /* replace */
6969 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 x = charmapencode_output('?', mapping, res, respos);
6971 if (x==enc_EXCEPTION) {
6972 return -1;
6973 }
6974 else if (x==enc_FAILED) {
6975 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6976 return -1;
6977 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006978 }
6979 /* fall through */
6980 case 3: /* ignore */
6981 *inpos = collendpos;
6982 break;
6983 case 4: /* xmlcharrefreplace */
6984 /* generate replacement (temporarily (mis)uses p) */
6985 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006986 char buffer[2+29+1+1];
6987 char *cp;
6988 sprintf(buffer, "&#%d;", (int)p[collpos]);
6989 for (cp = buffer; *cp; ++cp) {
6990 x = charmapencode_output(*cp, mapping, res, respos);
6991 if (x==enc_EXCEPTION)
6992 return -1;
6993 else if (x==enc_FAILED) {
6994 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6995 return -1;
6996 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006997 }
6998 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006999 *inpos = collendpos;
7000 break;
7001 default:
7002 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 encoding, reason, p, size, exceptionObject,
7004 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007005 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007007 if (PyBytes_Check(repunicode)) {
7008 /* Directly copy bytes result to output. */
7009 Py_ssize_t outsize = PyBytes_Size(*res);
7010 Py_ssize_t requiredsize;
7011 repsize = PyBytes_Size(repunicode);
7012 requiredsize = *respos + repsize;
7013 if (requiredsize > outsize)
7014 /* Make room for all additional bytes. */
7015 if (charmapencode_resize(res, respos, requiredsize)) {
7016 Py_DECREF(repunicode);
7017 return -1;
7018 }
7019 memcpy(PyBytes_AsString(*res) + *respos,
7020 PyBytes_AsString(repunicode), repsize);
7021 *respos += repsize;
7022 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007023 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007024 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007026 /* generate replacement */
7027 repsize = PyUnicode_GET_SIZE(repunicode);
7028 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 x = charmapencode_output(*uni2, mapping, res, respos);
7030 if (x==enc_EXCEPTION) {
7031 return -1;
7032 }
7033 else if (x==enc_FAILED) {
7034 Py_DECREF(repunicode);
7035 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7036 return -1;
7037 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007038 }
7039 *inpos = newpos;
7040 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007041 }
7042 return 0;
7043}
7044
Alexander Belopolsky40018472011-02-26 01:02:56 +00007045PyObject *
7046PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7047 Py_ssize_t size,
7048 PyObject *mapping,
7049 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007051 /* output object */
7052 PyObject *res = NULL;
7053 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007054 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007055 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007056 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007057 PyObject *errorHandler = NULL;
7058 PyObject *exc = NULL;
7059 /* the following variable is used for caching string comparisons
7060 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7061 * 3=ignore, 4=xmlcharrefreplace */
7062 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063
7064 /* Default to Latin-1 */
7065 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007068 /* allocate enough for a simple encoding without
7069 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007070 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007071 if (res == NULL)
7072 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007073 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007074 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007076 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 /* try to encode it */
7078 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7079 if (x==enc_EXCEPTION) /* error */
7080 goto onError;
7081 if (x==enc_FAILED) { /* unencodable character */
7082 if (charmap_encoding_error(p, size, &inpos, mapping,
7083 &exc,
7084 &known_errorHandler, &errorHandler, errors,
7085 &res, &respos)) {
7086 goto onError;
7087 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007088 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 else
7090 /* done with this character => adjust input position */
7091 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007094 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007095 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007096 if (_PyBytes_Resize(&res, respos) < 0)
7097 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007098
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007099 Py_XDECREF(exc);
7100 Py_XDECREF(errorHandler);
7101 return res;
7102
Benjamin Peterson29060642009-01-31 22:14:21 +00007103 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007104 Py_XDECREF(res);
7105 Py_XDECREF(exc);
7106 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 return NULL;
7108}
7109
Alexander Belopolsky40018472011-02-26 01:02:56 +00007110PyObject *
7111PyUnicode_AsCharmapString(PyObject *unicode,
7112 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113{
7114 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 PyErr_BadArgument();
7116 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 }
7118 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007119 PyUnicode_GET_SIZE(unicode),
7120 mapping,
7121 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122}
7123
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007124/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007125static void
7126make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007127 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007128 Py_ssize_t startpos, Py_ssize_t endpos,
7129 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007131 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007132 *exceptionObject = _PyUnicodeTranslateError_Create(
7133 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 }
7135 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7137 goto onError;
7138 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7139 goto onError;
7140 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7141 goto onError;
7142 return;
7143 onError:
7144 Py_DECREF(*exceptionObject);
7145 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146 }
7147}
7148
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007149/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007150static void
7151raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007152 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007153 Py_ssize_t startpos, Py_ssize_t endpos,
7154 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007155{
7156 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007157 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007158 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007159 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007160}
7161
7162/* error handling callback helper:
7163 build arguments, call the callback and check the arguments,
7164 put the result into newpos and return the replacement string, which
7165 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007166static PyObject *
7167unicode_translate_call_errorhandler(const char *errors,
7168 PyObject **errorHandler,
7169 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007170 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007171 Py_ssize_t startpos, Py_ssize_t endpos,
7172 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007173{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007174 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007175
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007176 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007177 PyObject *restuple;
7178 PyObject *resunicode;
7179
7180 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007181 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007182 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007183 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007184 }
7185
7186 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007187 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007188 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007190
7191 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007193 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007194 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007195 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007196 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 Py_DECREF(restuple);
7198 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199 }
7200 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 &resunicode, &i_newpos)) {
7202 Py_DECREF(restuple);
7203 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007204 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007205 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007206 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007207 else
7208 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007209 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7211 Py_DECREF(restuple);
7212 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007213 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007214 Py_INCREF(resunicode);
7215 Py_DECREF(restuple);
7216 return resunicode;
7217}
7218
7219/* Lookup the character ch in the mapping and put the result in result,
7220 which must be decrefed by the caller.
7221 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007222static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007223charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007224{
Christian Heimes217cfd12007-12-02 14:31:20 +00007225 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007226 PyObject *x;
7227
7228 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007230 x = PyObject_GetItem(mapping, w);
7231 Py_DECREF(w);
7232 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7234 /* No mapping found means: use 1:1 mapping. */
7235 PyErr_Clear();
7236 *result = NULL;
7237 return 0;
7238 } else
7239 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007240 }
7241 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 *result = x;
7243 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007244 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007245 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 long value = PyLong_AS_LONG(x);
7247 long max = PyUnicode_GetMax();
7248 if (value < 0 || value > max) {
7249 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007250 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007251 Py_DECREF(x);
7252 return -1;
7253 }
7254 *result = x;
7255 return 0;
7256 }
7257 else if (PyUnicode_Check(x)) {
7258 *result = x;
7259 return 0;
7260 }
7261 else {
7262 /* wrong return value */
7263 PyErr_SetString(PyExc_TypeError,
7264 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007265 Py_DECREF(x);
7266 return -1;
7267 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007268}
7269/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 if not reallocate and adjust various state variables.
7271 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007272static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007273charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007274 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007275{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007276 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007277 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007278 /* exponentially overallocate to minimize reallocations */
7279 if (requiredsize < 2 * oldsize)
7280 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007281 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7282 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007283 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007284 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007285 }
7286 return 0;
7287}
7288/* lookup the character, put the result in the output string and adjust
7289 various state variables. Return a new reference to the object that
7290 was put in the output buffer in *result, or Py_None, if the mapping was
7291 undefined (in which case no character was written).
7292 The called must decref result.
7293 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007294static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007295charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7296 PyObject *mapping, Py_UCS4 **output,
7297 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007298 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007299{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007300 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7301 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007302 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007303 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007305 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007306 }
7307 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007308 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007309 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007310 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007311 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007312 }
7313 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007314 Py_ssize_t repsize;
7315 if (PyUnicode_READY(*res) == -1)
7316 return -1;
7317 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 if (repsize==1) {
7319 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007320 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 }
7322 else if (repsize!=0) {
7323 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007324 Py_ssize_t requiredsize = *opos +
7325 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007326 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007327 Py_ssize_t i;
7328 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007329 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007330 for(i = 0; i < repsize; i++)
7331 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007333 }
7334 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007336 return 0;
7337}
7338
Alexander Belopolsky40018472011-02-26 01:02:56 +00007339PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007340_PyUnicode_TranslateCharmap(PyObject *input,
7341 PyObject *mapping,
7342 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007344 /* input object */
7345 char *idata;
7346 Py_ssize_t size, i;
7347 int kind;
7348 /* output buffer */
7349 Py_UCS4 *output = NULL;
7350 Py_ssize_t osize;
7351 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007352 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007353 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007354 char *reason = "character maps to <undefined>";
7355 PyObject *errorHandler = NULL;
7356 PyObject *exc = NULL;
7357 /* the following variable is used for caching string comparisons
7358 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7359 * 3=ignore, 4=xmlcharrefreplace */
7360 int known_errorHandler = -1;
7361
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 PyErr_BadArgument();
7364 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007367 if (PyUnicode_READY(input) == -1)
7368 return NULL;
7369 idata = (char*)PyUnicode_DATA(input);
7370 kind = PyUnicode_KIND(input);
7371 size = PyUnicode_GET_LENGTH(input);
7372 i = 0;
7373
7374 if (size == 0) {
7375 Py_INCREF(input);
7376 return input;
7377 }
7378
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007379 /* allocate enough for a simple 1:1 translation without
7380 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007381 osize = size;
7382 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7383 opos = 0;
7384 if (output == NULL) {
7385 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007387 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007389 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 /* try to encode it */
7391 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007392 if (charmaptranslate_output(input, i, mapping,
7393 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 Py_XDECREF(x);
7395 goto onError;
7396 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007397 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007399 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 else { /* untranslatable character */
7401 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7402 Py_ssize_t repsize;
7403 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007404 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007406 Py_ssize_t collstart = i;
7407 Py_ssize_t collend = i+1;
7408 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007411 while (collend < size) {
7412 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 goto onError;
7414 Py_XDECREF(x);
7415 if (x!=Py_None)
7416 break;
7417 ++collend;
7418 }
7419 /* cache callback name lookup
7420 * (if not done yet, i.e. it's the first error) */
7421 if (known_errorHandler==-1) {
7422 if ((errors==NULL) || (!strcmp(errors, "strict")))
7423 known_errorHandler = 1;
7424 else if (!strcmp(errors, "replace"))
7425 known_errorHandler = 2;
7426 else if (!strcmp(errors, "ignore"))
7427 known_errorHandler = 3;
7428 else if (!strcmp(errors, "xmlcharrefreplace"))
7429 known_errorHandler = 4;
7430 else
7431 known_errorHandler = 0;
7432 }
7433 switch (known_errorHandler) {
7434 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007435 raise_translate_exception(&exc, input, collstart,
7436 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007437 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 case 2: /* replace */
7439 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007440 for (coll = collstart; coll<collend; coll++)
7441 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 /* fall through */
7443 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007444 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 break;
7446 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007447 /* generate replacement (temporarily (mis)uses i) */
7448 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 char buffer[2+29+1+1];
7450 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007451 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7452 if (charmaptranslate_makespace(&output, &osize,
7453 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 goto onError;
7455 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007456 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007458 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 break;
7460 default:
7461 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007462 reason, input, &exc,
7463 collstart, collend, &newpos);
7464 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 goto onError;
7466 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007467 repsize = PyUnicode_GET_LENGTH(repunicode);
7468 if (charmaptranslate_makespace(&output, &osize,
7469 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 Py_DECREF(repunicode);
7471 goto onError;
7472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007473 for (uni2 = 0; repsize-->0; ++uni2)
7474 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7475 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007478 }
7479 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007480 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7481 if (!res)
7482 goto onError;
7483 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007484 Py_XDECREF(exc);
7485 Py_XDECREF(errorHandler);
7486 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007489 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007490 Py_XDECREF(exc);
7491 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492 return NULL;
7493}
7494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007495/* Deprecated. Use PyUnicode_Translate instead. */
7496PyObject *
7497PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7498 Py_ssize_t size,
7499 PyObject *mapping,
7500 const char *errors)
7501{
7502 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7503 if (!unicode)
7504 return NULL;
7505 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7506}
7507
Alexander Belopolsky40018472011-02-26 01:02:56 +00007508PyObject *
7509PyUnicode_Translate(PyObject *str,
7510 PyObject *mapping,
7511 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512{
7513 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007514
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 str = PyUnicode_FromObject(str);
7516 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007518 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519 Py_DECREF(str);
7520 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007521
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523 Py_XDECREF(str);
7524 return NULL;
7525}
Tim Petersced69f82003-09-16 20:30:58 +00007526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007527static Py_UCS4
7528fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7529{
7530 /* No need to call PyUnicode_READY(self) because this function is only
7531 called as a callback from fixup() which does it already. */
7532 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7533 const int kind = PyUnicode_KIND(self);
7534 void *data = PyUnicode_DATA(self);
7535 Py_UCS4 maxchar = 0, ch, fixed;
7536 Py_ssize_t i;
7537
7538 for (i = 0; i < len; ++i) {
7539 ch = PyUnicode_READ(kind, data, i);
7540 fixed = 0;
7541 if (ch > 127) {
7542 if (Py_UNICODE_ISSPACE(ch))
7543 fixed = ' ';
7544 else {
7545 const int decimal = Py_UNICODE_TODECIMAL(ch);
7546 if (decimal >= 0)
7547 fixed = '0' + decimal;
7548 }
7549 if (fixed != 0) {
7550 if (fixed > maxchar)
7551 maxchar = fixed;
7552 PyUnicode_WRITE(kind, data, i, fixed);
7553 }
7554 else if (ch > maxchar)
7555 maxchar = ch;
7556 }
7557 else if (ch > maxchar)
7558 maxchar = ch;
7559 }
7560
7561 return maxchar;
7562}
7563
7564PyObject *
7565_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7566{
7567 if (!PyUnicode_Check(unicode)) {
7568 PyErr_BadInternalCall();
7569 return NULL;
7570 }
7571 if (PyUnicode_READY(unicode) == -1)
7572 return NULL;
7573 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7574 /* If the string is already ASCII, just return the same string */
7575 Py_INCREF(unicode);
7576 return unicode;
7577 }
7578 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7579}
7580
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007581PyObject *
7582PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7583 Py_ssize_t length)
7584{
7585 PyObject *result;
7586 Py_UNICODE *p; /* write pointer into result */
7587 Py_ssize_t i;
7588 /* Copy to a new string */
7589 result = (PyObject *)_PyUnicode_New(length);
7590 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7591 if (result == NULL)
7592 return result;
7593 p = PyUnicode_AS_UNICODE(result);
7594 /* Iterate over code points */
7595 for (i = 0; i < length; i++) {
7596 Py_UNICODE ch =s[i];
7597 if (ch > 127) {
7598 int decimal = Py_UNICODE_TODECIMAL(ch);
7599 if (decimal >= 0)
7600 p[i] = '0' + decimal;
7601 }
7602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007603 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7604 Py_DECREF(result);
7605 return NULL;
7606 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007607 return result;
7608}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007609/* --- Decimal Encoder ---------------------------------------------------- */
7610
Alexander Belopolsky40018472011-02-26 01:02:56 +00007611int
7612PyUnicode_EncodeDecimal(Py_UNICODE *s,
7613 Py_ssize_t length,
7614 char *output,
7615 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007616{
7617 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007618 PyObject *errorHandler = NULL;
7619 PyObject *exc = NULL;
7620 const char *encoding = "decimal";
7621 const char *reason = "invalid decimal Unicode string";
7622 /* the following variable is used for caching string comparisons
7623 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7624 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007625
7626 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 PyErr_BadArgument();
7628 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007629 }
7630
7631 p = s;
7632 end = s + length;
7633 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 register Py_UNICODE ch = *p;
7635 int decimal;
7636 PyObject *repunicode;
7637 Py_ssize_t repsize;
7638 Py_ssize_t newpos;
7639 Py_UNICODE *uni2;
7640 Py_UNICODE *collstart;
7641 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007642
Benjamin Peterson29060642009-01-31 22:14:21 +00007643 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007644 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 ++p;
7646 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007647 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 decimal = Py_UNICODE_TODECIMAL(ch);
7649 if (decimal >= 0) {
7650 *output++ = '0' + decimal;
7651 ++p;
7652 continue;
7653 }
7654 if (0 < ch && ch < 256) {
7655 *output++ = (char)ch;
7656 ++p;
7657 continue;
7658 }
7659 /* All other characters are considered unencodable */
7660 collstart = p;
7661 collend = p+1;
7662 while (collend < end) {
7663 if ((0 < *collend && *collend < 256) ||
7664 !Py_UNICODE_ISSPACE(*collend) ||
7665 Py_UNICODE_TODECIMAL(*collend))
7666 break;
7667 }
7668 /* cache callback name lookup
7669 * (if not done yet, i.e. it's the first error) */
7670 if (known_errorHandler==-1) {
7671 if ((errors==NULL) || (!strcmp(errors, "strict")))
7672 known_errorHandler = 1;
7673 else if (!strcmp(errors, "replace"))
7674 known_errorHandler = 2;
7675 else if (!strcmp(errors, "ignore"))
7676 known_errorHandler = 3;
7677 else if (!strcmp(errors, "xmlcharrefreplace"))
7678 known_errorHandler = 4;
7679 else
7680 known_errorHandler = 0;
7681 }
7682 switch (known_errorHandler) {
7683 case 1: /* strict */
7684 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7685 goto onError;
7686 case 2: /* replace */
7687 for (p = collstart; p < collend; ++p)
7688 *output++ = '?';
7689 /* fall through */
7690 case 3: /* ignore */
7691 p = collend;
7692 break;
7693 case 4: /* xmlcharrefreplace */
7694 /* generate replacement (temporarily (mis)uses p) */
7695 for (p = collstart; p < collend; ++p)
7696 output += sprintf(output, "&#%d;", (int)*p);
7697 p = collend;
7698 break;
7699 default:
7700 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7701 encoding, reason, s, length, &exc,
7702 collstart-s, collend-s, &newpos);
7703 if (repunicode == NULL)
7704 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007705 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007706 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007707 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7708 Py_DECREF(repunicode);
7709 goto onError;
7710 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007711 /* generate replacement */
7712 repsize = PyUnicode_GET_SIZE(repunicode);
7713 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7714 Py_UNICODE ch = *uni2;
7715 if (Py_UNICODE_ISSPACE(ch))
7716 *output++ = ' ';
7717 else {
7718 decimal = Py_UNICODE_TODECIMAL(ch);
7719 if (decimal >= 0)
7720 *output++ = '0' + decimal;
7721 else if (0 < ch && ch < 256)
7722 *output++ = (char)ch;
7723 else {
7724 Py_DECREF(repunicode);
7725 raise_encode_exception(&exc, encoding,
7726 s, length, collstart-s, collend-s, reason);
7727 goto onError;
7728 }
7729 }
7730 }
7731 p = s + newpos;
7732 Py_DECREF(repunicode);
7733 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007734 }
7735 /* 0-terminate the output string */
7736 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007737 Py_XDECREF(exc);
7738 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007739 return 0;
7740
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007742 Py_XDECREF(exc);
7743 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007744 return -1;
7745}
7746
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747/* --- Helpers ------------------------------------------------------------ */
7748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007749#include "stringlib/ucs1lib.h"
7750#include "stringlib/fastsearch.h"
7751#include "stringlib/partition.h"
7752#include "stringlib/split.h"
7753#include "stringlib/count.h"
7754#include "stringlib/find.h"
7755#include "stringlib/localeutil.h"
7756#include "stringlib/undef.h"
7757
7758#include "stringlib/ucs2lib.h"
7759#include "stringlib/fastsearch.h"
7760#include "stringlib/partition.h"
7761#include "stringlib/split.h"
7762#include "stringlib/count.h"
7763#include "stringlib/find.h"
7764#include "stringlib/localeutil.h"
7765#include "stringlib/undef.h"
7766
7767#include "stringlib/ucs4lib.h"
7768#include "stringlib/fastsearch.h"
7769#include "stringlib/partition.h"
7770#include "stringlib/split.h"
7771#include "stringlib/count.h"
7772#include "stringlib/find.h"
7773#include "stringlib/localeutil.h"
7774#include "stringlib/undef.h"
7775
7776static Py_ssize_t
7777any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7778 const Py_UCS1*, Py_ssize_t,
7779 Py_ssize_t, Py_ssize_t),
7780 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7781 const Py_UCS2*, Py_ssize_t,
7782 Py_ssize_t, Py_ssize_t),
7783 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7784 const Py_UCS4*, Py_ssize_t,
7785 Py_ssize_t, Py_ssize_t),
7786 PyObject* s1, PyObject* s2,
7787 Py_ssize_t start,
7788 Py_ssize_t end)
7789{
7790 int kind1, kind2, kind;
7791 void *buf1, *buf2;
7792 Py_ssize_t len1, len2, result;
7793
7794 kind1 = PyUnicode_KIND(s1);
7795 kind2 = PyUnicode_KIND(s2);
7796 kind = kind1 > kind2 ? kind1 : kind2;
7797 buf1 = PyUnicode_DATA(s1);
7798 buf2 = PyUnicode_DATA(s2);
7799 if (kind1 != kind)
7800 buf1 = _PyUnicode_AsKind(s1, kind);
7801 if (!buf1)
7802 return -2;
7803 if (kind2 != kind)
7804 buf2 = _PyUnicode_AsKind(s2, kind);
7805 if (!buf2) {
7806 if (kind1 != kind) PyMem_Free(buf1);
7807 return -2;
7808 }
7809 len1 = PyUnicode_GET_LENGTH(s1);
7810 len2 = PyUnicode_GET_LENGTH(s2);
7811
7812 switch(kind) {
7813 case PyUnicode_1BYTE_KIND:
7814 result = ucs1(buf1, len1, buf2, len2, start, end);
7815 break;
7816 case PyUnicode_2BYTE_KIND:
7817 result = ucs2(buf1, len1, buf2, len2, start, end);
7818 break;
7819 case PyUnicode_4BYTE_KIND:
7820 result = ucs4(buf1, len1, buf2, len2, start, end);
7821 break;
7822 default:
7823 assert(0); result = -2;
7824 }
7825
7826 if (kind1 != kind)
7827 PyMem_Free(buf1);
7828 if (kind2 != kind)
7829 PyMem_Free(buf2);
7830
7831 return result;
7832}
7833
7834Py_ssize_t
7835_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7836 Py_ssize_t n_buffer,
7837 void *digits, Py_ssize_t n_digits,
7838 Py_ssize_t min_width,
7839 const char *grouping,
7840 const char *thousands_sep)
7841{
7842 switch(kind) {
7843 case PyUnicode_1BYTE_KIND:
7844 return _PyUnicode_ucs1_InsertThousandsGrouping(
7845 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7846 min_width, grouping, thousands_sep);
7847 case PyUnicode_2BYTE_KIND:
7848 return _PyUnicode_ucs2_InsertThousandsGrouping(
7849 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7850 min_width, grouping, thousands_sep);
7851 case PyUnicode_4BYTE_KIND:
7852 return _PyUnicode_ucs4_InsertThousandsGrouping(
7853 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7854 min_width, grouping, thousands_sep);
7855 }
7856 assert(0);
7857 return -1;
7858}
7859
7860
Eric Smith8c663262007-08-25 02:26:07 +00007861#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007862#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007863
Thomas Wouters477c8d52006-05-27 19:21:47 +00007864#include "stringlib/count.h"
7865#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007866
/* helper macro to fixup start/end slice values

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if still negative.  `start` and `end` must be
   modifiable lvalues.  All three arguments may be evaluated more than once.

   Wrapped in do { } while (0) so that an invocation followed by ';' is a
   single statement and is safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007881
Alexander Belopolsky40018472011-02-26 01:02:56 +00007882Py_ssize_t
7883PyUnicode_Count(PyObject *str,
7884 PyObject *substr,
7885 Py_ssize_t start,
7886 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007888 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007889 PyUnicodeObject* str_obj;
7890 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007891 int kind1, kind2, kind;
7892 void *buf1 = NULL, *buf2 = NULL;
7893 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007894
Thomas Wouters477c8d52006-05-27 19:21:47 +00007895 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007896 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007898 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007899 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 Py_DECREF(str_obj);
7901 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902 }
Tim Petersced69f82003-09-16 20:30:58 +00007903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007904 kind1 = PyUnicode_KIND(str_obj);
7905 kind2 = PyUnicode_KIND(sub_obj);
7906 kind = kind1 > kind2 ? kind1 : kind2;
7907 buf1 = PyUnicode_DATA(str_obj);
7908 if (kind1 != kind)
7909 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7910 if (!buf1)
7911 goto onError;
7912 buf2 = PyUnicode_DATA(sub_obj);
7913 if (kind2 != kind)
7914 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7915 if (!buf2)
7916 goto onError;
7917 len1 = PyUnicode_GET_LENGTH(str_obj);
7918 len2 = PyUnicode_GET_LENGTH(sub_obj);
7919
7920 ADJUST_INDICES(start, end, len1);
7921 switch(kind) {
7922 case PyUnicode_1BYTE_KIND:
7923 result = ucs1lib_count(
7924 ((Py_UCS1*)buf1) + start, end - start,
7925 buf2, len2, PY_SSIZE_T_MAX
7926 );
7927 break;
7928 case PyUnicode_2BYTE_KIND:
7929 result = ucs2lib_count(
7930 ((Py_UCS2*)buf1) + start, end - start,
7931 buf2, len2, PY_SSIZE_T_MAX
7932 );
7933 break;
7934 case PyUnicode_4BYTE_KIND:
7935 result = ucs4lib_count(
7936 ((Py_UCS4*)buf1) + start, end - start,
7937 buf2, len2, PY_SSIZE_T_MAX
7938 );
7939 break;
7940 default:
7941 assert(0); result = 0;
7942 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007943
7944 Py_DECREF(sub_obj);
7945 Py_DECREF(str_obj);
7946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007947 if (kind1 != kind)
7948 PyMem_Free(buf1);
7949 if (kind2 != kind)
7950 PyMem_Free(buf2);
7951
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007953 onError:
7954 Py_DECREF(sub_obj);
7955 Py_DECREF(str_obj);
7956 if (kind1 != kind && buf1)
7957 PyMem_Free(buf1);
7958 if (kind2 != kind && buf2)
7959 PyMem_Free(buf2);
7960 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961}
7962
Alexander Belopolsky40018472011-02-26 01:02:56 +00007963Py_ssize_t
7964PyUnicode_Find(PyObject *str,
7965 PyObject *sub,
7966 Py_ssize_t start,
7967 Py_ssize_t end,
7968 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007970 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00007971
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007973 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007975 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007976 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 Py_DECREF(str);
7978 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 }
Tim Petersced69f82003-09-16 20:30:58 +00007980
Thomas Wouters477c8d52006-05-27 19:21:47 +00007981 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007982 result = any_find_slice(
7983 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
7984 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007985 );
7986 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007987 result = any_find_slice(
7988 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
7989 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007990 );
7991
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007993 Py_DECREF(sub);
7994
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 return result;
7996}
7997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007998Py_ssize_t
7999PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8000 Py_ssize_t start, Py_ssize_t end,
8001 int direction)
8002{
8003 char *result;
8004 int kind;
8005 if (PyUnicode_READY(str) == -1)
8006 return -2;
8007 if (end > PyUnicode_GET_LENGTH(str))
8008 end = PyUnicode_GET_LENGTH(str);
8009 kind = PyUnicode_KIND(str);
8010 result = findchar(PyUnicode_1BYTE_DATA(str)
8011 + PyUnicode_KIND_SIZE(kind, start),
8012 kind,
8013 end-start, ch, direction);
8014 if (!result)
8015 return -1;
8016 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8017}
8018
Alexander Belopolsky40018472011-02-26 01:02:56 +00008019static int
8020tailmatch(PyUnicodeObject *self,
8021 PyUnicodeObject *substring,
8022 Py_ssize_t start,
8023 Py_ssize_t end,
8024 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 int kind_self;
8027 int kind_sub;
8028 void *data_self;
8029 void *data_sub;
8030 Py_ssize_t offset;
8031 Py_ssize_t i;
8032 Py_ssize_t end_sub;
8033
8034 if (PyUnicode_READY(self) == -1 ||
8035 PyUnicode_READY(substring) == -1)
8036 return 0;
8037
8038 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 return 1;
8040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008041 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8042 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008046 kind_self = PyUnicode_KIND(self);
8047 data_self = PyUnicode_DATA(self);
8048 kind_sub = PyUnicode_KIND(substring);
8049 data_sub = PyUnicode_DATA(substring);
8050 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8051
8052 if (direction > 0)
8053 offset = end;
8054 else
8055 offset = start;
8056
8057 if (PyUnicode_READ(kind_self, data_self, offset) ==
8058 PyUnicode_READ(kind_sub, data_sub, 0) &&
8059 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8060 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8061 /* If both are of the same kind, memcmp is sufficient */
8062 if (kind_self == kind_sub) {
8063 return ! memcmp((char *)data_self +
8064 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8065 data_sub,
8066 PyUnicode_GET_LENGTH(substring) *
8067 PyUnicode_CHARACTER_SIZE(substring));
8068 }
8069 /* otherwise we have to compare each character by first accesing it */
8070 else {
8071 /* We do not need to compare 0 and len(substring)-1 because
8072 the if statement above ensured already that they are equal
8073 when we end up here. */
8074 // TODO: honor direction and do a forward or backwards search
8075 for (i = 1; i < end_sub; ++i) {
8076 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8077 PyUnicode_READ(kind_sub, data_sub, i))
8078 return 0;
8079 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008080 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082 }
8083
8084 return 0;
8085}
8086
Alexander Belopolsky40018472011-02-26 01:02:56 +00008087Py_ssize_t
8088PyUnicode_Tailmatch(PyObject *str,
8089 PyObject *substr,
8090 Py_ssize_t start,
8091 Py_ssize_t end,
8092 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008094 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008095
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096 str = PyUnicode_FromObject(str);
8097 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099 substr = PyUnicode_FromObject(substr);
8100 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008101 Py_DECREF(str);
8102 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 }
Tim Petersced69f82003-09-16 20:30:58 +00008104
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 (PyUnicodeObject *)substr,
8107 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108 Py_DECREF(str);
8109 Py_DECREF(substr);
8110 return result;
8111}
8112
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113/* Apply fixfct filter to the Unicode object self and return a
8114 reference to the modified object */
8115
Alexander Belopolsky40018472011-02-26 01:02:56 +00008116static PyObject *
8117fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008118 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008120 PyObject *u;
8121 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008123 if (PyUnicode_READY(self) == -1)
8124 return NULL;
8125 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8126 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8127 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8132 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008134 /* fix functions return the new maximum character in a string,
8135 if the kind of the resulting unicode object does not change,
8136 everything is fine. Otherwise we need to change the string kind
8137 and re-run the fix function. */
8138 maxchar_new = fixfct((PyUnicodeObject*)u);
8139 if (maxchar_new == 0)
8140 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8141 else if (maxchar_new <= 127)
8142 maxchar_new = 127;
8143 else if (maxchar_new <= 255)
8144 maxchar_new = 255;
8145 else if (maxchar_new <= 65535)
8146 maxchar_new = 65535;
8147 else
8148 maxchar_new = 1114111; /* 0x10ffff */
8149
8150 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 /* fixfct should return TRUE if it modified the buffer. If
8152 FALSE, return a reference to the original buffer instead
8153 (to save space, not time) */
8154 Py_INCREF(self);
8155 Py_DECREF(u);
8156 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 else if (maxchar_new == maxchar_old) {
8159 return u;
8160 }
8161 else {
8162 /* In case the maximum character changed, we need to
8163 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008164 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008165 if (v == NULL) {
8166 Py_DECREF(u);
8167 return NULL;
8168 }
8169 if (maxchar_new > maxchar_old) {
8170 /* If the maxchar increased so that the kind changed, not all
8171 characters are representable anymore and we need to fix the
8172 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008173 if (PyUnicode_CopyCharacters(v, 0,
8174 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008175 PyUnicode_GET_LENGTH(self)) < 0)
8176 {
8177 Py_DECREF(u);
8178 return NULL;
8179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 maxchar_old = fixfct((PyUnicodeObject*)v);
8181 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8182 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008183 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008184 if (PyUnicode_CopyCharacters(v, 0,
8185 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008186 PyUnicode_GET_LENGTH(self)) < 0)
8187 {
8188 Py_DECREF(u);
8189 return NULL;
8190 }
8191 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192
8193 Py_DECREF(u);
8194 return v;
8195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196}
8197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008198static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008199fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008201 /* No need to call PyUnicode_READY(self) because this function is only
8202 called as a callback from fixup() which does it already. */
8203 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8204 const int kind = PyUnicode_KIND(self);
8205 void *data = PyUnicode_DATA(self);
8206 int touched = 0;
8207 Py_UCS4 maxchar = 0;
8208 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008210 for (i = 0; i < len; ++i) {
8211 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8212 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8213 if (up != ch) {
8214 if (up > maxchar)
8215 maxchar = up;
8216 PyUnicode_WRITE(kind, data, i, up);
8217 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008219 else if (ch > maxchar)
8220 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221 }
8222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008223 if (touched)
8224 return maxchar;
8225 else
8226 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227}
8228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008229static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008230fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008232 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8233 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8234 const int kind = PyUnicode_KIND(self);
8235 void *data = PyUnicode_DATA(self);
8236 int touched = 0;
8237 Py_UCS4 maxchar = 0;
8238 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008240 for(i = 0; i < len; ++i) {
8241 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8242 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8243 if (lo != ch) {
8244 if (lo > maxchar)
8245 maxchar = lo;
8246 PyUnicode_WRITE(kind, data, i, lo);
8247 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008249 else if (ch > maxchar)
8250 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251 }
8252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008253 if (touched)
8254 return maxchar;
8255 else
8256 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257}
8258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008259static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008260fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008262 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8263 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8264 const int kind = PyUnicode_KIND(self);
8265 void *data = PyUnicode_DATA(self);
8266 int touched = 0;
8267 Py_UCS4 maxchar = 0;
8268 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008270 for(i = 0; i < len; ++i) {
8271 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8272 Py_UCS4 nu = 0;
8273
8274 if (Py_UNICODE_ISUPPER(ch))
8275 nu = Py_UNICODE_TOLOWER(ch);
8276 else if (Py_UNICODE_ISLOWER(ch))
8277 nu = Py_UNICODE_TOUPPER(ch);
8278
8279 if (nu != 0) {
8280 if (nu > maxchar)
8281 maxchar = nu;
8282 PyUnicode_WRITE(kind, data, i, nu);
8283 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008285 else if (ch > maxchar)
8286 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 }
8288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008289 if (touched)
8290 return maxchar;
8291 else
8292 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293}
8294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008296fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008298 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8299 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8300 const int kind = PyUnicode_KIND(self);
8301 void *data = PyUnicode_DATA(self);
8302 int touched = 0;
8303 Py_UCS4 maxchar = 0;
8304 Py_ssize_t i = 0;
8305 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008306
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008307 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008309
8310 ch = PyUnicode_READ(kind, data, i);
8311 if (!Py_UNICODE_ISUPPER(ch)) {
8312 maxchar = Py_UNICODE_TOUPPER(ch);
8313 PyUnicode_WRITE(kind, data, i, maxchar);
8314 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008316 ++i;
8317 for(; i < len; ++i) {
8318 ch = PyUnicode_READ(kind, data, i);
8319 if (!Py_UNICODE_ISLOWER(ch)) {
8320 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8321 if (lo > maxchar)
8322 maxchar = lo;
8323 PyUnicode_WRITE(kind, data, i, lo);
8324 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008326 else if (ch > maxchar)
8327 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329
8330 if (touched)
8331 return maxchar;
8332 else
8333 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334}
8335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008336static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008337fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008339 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8340 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8341 const int kind = PyUnicode_KIND(self);
8342 void *data = PyUnicode_DATA(self);
8343 Py_UCS4 maxchar = 0;
8344 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345 int previous_is_cased;
8346
8347 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348 if (len == 1) {
8349 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8350 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8351 if (ti != ch) {
8352 PyUnicode_WRITE(kind, data, i, ti);
8353 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 }
8355 else
8356 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008359 for(; i < len; ++i) {
8360 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8361 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008362
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 nu = Py_UNICODE_TOTITLE(ch);
8367
8368 if (nu > maxchar)
8369 maxchar = nu;
8370 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008371
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 if (Py_UNICODE_ISLOWER(ch) ||
8373 Py_UNICODE_ISUPPER(ch) ||
8374 Py_UNICODE_ISTITLE(ch))
8375 previous_is_cased = 1;
8376 else
8377 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380}
8381
Tim Peters8ce9f162004-08-27 01:49:32 +00008382PyObject *
8383PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008386 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008388 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008389 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8390 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008391 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008392 Py_ssize_t sz, i, res_offset;
8393 Py_UCS4 maxchar = 0;
8394 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395
Tim Peters05eba1f2004-08-27 21:32:02 +00008396 fseq = PySequence_Fast(seq, "");
8397 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008398 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008399 }
8400
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008401 /* NOTE: the following code can't call back into Python code,
8402 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008403 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008404
Tim Peters05eba1f2004-08-27 21:32:02 +00008405 seqlen = PySequence_Fast_GET_SIZE(fseq);
8406 /* If empty sequence, return u"". */
8407 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008408 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008409 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008410 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008411 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008412 /* If singleton sequence with an exact Unicode, return that. */
8413 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 item = items[0];
8415 if (PyUnicode_CheckExact(item)) {
8416 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 goto Done;
8419 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008420 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008421 else {
8422 /* Set up sep and seplen */
8423 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 /* fall back to a blank space separator */
8425 sep = PyUnicode_FromOrdinal(' ');
8426 if (!sep || PyUnicode_READY(sep) == -1)
8427 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008428 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008429 else {
8430 if (!PyUnicode_Check(separator)) {
8431 PyErr_Format(PyExc_TypeError,
8432 "separator: expected str instance,"
8433 " %.80s found",
8434 Py_TYPE(separator)->tp_name);
8435 goto onError;
8436 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437 if (PyUnicode_READY(separator) == -1)
8438 goto onError;
8439 sep = separator;
8440 seplen = PyUnicode_GET_LENGTH(separator);
8441 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8442 /* inc refcount to keep this code path symetric with the
8443 above case of a blank separator */
8444 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008445 }
8446 }
8447
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008448 /* There are at least two things to join, or else we have a subclass
8449 * of str in the sequence.
8450 * Do a pre-pass to figure out the total amount of space we'll
8451 * need (sz), and see whether all argument are strings.
8452 */
8453 sz = 0;
8454 for (i = 0; i < seqlen; i++) {
8455 const Py_ssize_t old_sz = sz;
8456 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 if (!PyUnicode_Check(item)) {
8458 PyErr_Format(PyExc_TypeError,
8459 "sequence item %zd: expected str instance,"
8460 " %.80s found",
8461 i, Py_TYPE(item)->tp_name);
8462 goto onError;
8463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 if (PyUnicode_READY(item) == -1)
8465 goto onError;
8466 sz += PyUnicode_GET_LENGTH(item);
8467 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8468 if (item_maxchar > maxchar)
8469 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008470 if (i != 0)
8471 sz += seplen;
8472 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8473 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008475 goto onError;
8476 }
8477 }
Tim Petersced69f82003-09-16 20:30:58 +00008478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008479 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008480 if (res == NULL)
8481 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008482
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008483 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008485 Py_ssize_t itemlen;
8486 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 /* Copy item, and maybe the separator. */
8489 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008490 if (PyUnicode_CopyCharacters(res, res_offset,
8491 sep, 0, seplen) < 0)
8492 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008495 if (PyUnicode_CopyCharacters(res, res_offset,
8496 item, 0, itemlen) < 0)
8497 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008500 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008501
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008503 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 Py_XDECREF(sep);
8505 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008508 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008510 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511 return NULL;
8512}
8513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514#define FILL(kind, data, value, start, length) \
8515 do { \
8516 Py_ssize_t i_ = 0; \
8517 assert(kind != PyUnicode_WCHAR_KIND); \
8518 switch ((kind)) { \
8519 case PyUnicode_1BYTE_KIND: { \
8520 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8521 memset(to_, (unsigned char)value, length); \
8522 break; \
8523 } \
8524 case PyUnicode_2BYTE_KIND: { \
8525 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8526 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8527 break; \
8528 } \
8529 default: { \
8530 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8531 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8532 break; \
8533 } \
8534 } \
8535 } while (0)
8536
Alexander Belopolsky40018472011-02-26 01:02:56 +00008537static PyUnicodeObject *
8538pad(PyUnicodeObject *self,
8539 Py_ssize_t left,
8540 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008543 PyObject *u;
8544 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008545 int kind;
8546 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547
8548 if (left < 0)
8549 left = 0;
8550 if (right < 0)
8551 right = 0;
8552
Tim Peters7a29bd52001-09-12 03:03:31 +00008553 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554 Py_INCREF(self);
8555 return self;
8556 }
8557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8559 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008560 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8561 return NULL;
8562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008563 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8564 if (fill > maxchar)
8565 maxchar = fill;
8566 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008567 if (!u)
8568 return NULL;
8569
8570 kind = PyUnicode_KIND(u);
8571 data = PyUnicode_DATA(u);
8572 if (left)
8573 FILL(kind, data, fill, 0, left);
8574 if (right)
8575 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008576 if (PyUnicode_CopyCharacters(u, left,
8577 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008578 _PyUnicode_LENGTH(self)) < 0)
8579 {
8580 Py_DECREF(u);
8581 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582 }
8583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587
Alexander Belopolsky40018472011-02-26 01:02:56 +00008588PyObject *
8589PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592
8593 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008597 switch(PyUnicode_KIND(string)) {
8598 case PyUnicode_1BYTE_KIND:
8599 list = ucs1lib_splitlines(
8600 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8601 PyUnicode_GET_LENGTH(string), keepends);
8602 break;
8603 case PyUnicode_2BYTE_KIND:
8604 list = ucs2lib_splitlines(
8605 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8606 PyUnicode_GET_LENGTH(string), keepends);
8607 break;
8608 case PyUnicode_4BYTE_KIND:
8609 list = ucs4lib_splitlines(
8610 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8611 PyUnicode_GET_LENGTH(string), keepends);
8612 break;
8613 default:
8614 assert(0);
8615 list = 0;
8616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617 Py_DECREF(string);
8618 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619}
8620
Alexander Belopolsky40018472011-02-26 01:02:56 +00008621static PyObject *
8622split(PyUnicodeObject *self,
8623 PyUnicodeObject *substring,
8624 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 int kind1, kind2, kind;
8627 void *buf1, *buf2;
8628 Py_ssize_t len1, len2;
8629 PyObject* out;
8630
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008632 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 if (PyUnicode_READY(self) == -1)
8635 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 if (substring == NULL)
8638 switch(PyUnicode_KIND(self)) {
8639 case PyUnicode_1BYTE_KIND:
8640 return ucs1lib_split_whitespace(
8641 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8642 PyUnicode_GET_LENGTH(self), maxcount
8643 );
8644 case PyUnicode_2BYTE_KIND:
8645 return ucs2lib_split_whitespace(
8646 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8647 PyUnicode_GET_LENGTH(self), maxcount
8648 );
8649 case PyUnicode_4BYTE_KIND:
8650 return ucs4lib_split_whitespace(
8651 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8652 PyUnicode_GET_LENGTH(self), maxcount
8653 );
8654 default:
8655 assert(0);
8656 return NULL;
8657 }
8658
8659 if (PyUnicode_READY(substring) == -1)
8660 return NULL;
8661
8662 kind1 = PyUnicode_KIND(self);
8663 kind2 = PyUnicode_KIND(substring);
8664 kind = kind1 > kind2 ? kind1 : kind2;
8665 buf1 = PyUnicode_DATA(self);
8666 buf2 = PyUnicode_DATA(substring);
8667 if (kind1 != kind)
8668 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8669 if (!buf1)
8670 return NULL;
8671 if (kind2 != kind)
8672 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8673 if (!buf2) {
8674 if (kind1 != kind) PyMem_Free(buf1);
8675 return NULL;
8676 }
8677 len1 = PyUnicode_GET_LENGTH(self);
8678 len2 = PyUnicode_GET_LENGTH(substring);
8679
8680 switch(kind) {
8681 case PyUnicode_1BYTE_KIND:
8682 out = ucs1lib_split(
8683 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8684 break;
8685 case PyUnicode_2BYTE_KIND:
8686 out = ucs2lib_split(
8687 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8688 break;
8689 case PyUnicode_4BYTE_KIND:
8690 out = ucs4lib_split(
8691 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8692 break;
8693 default:
8694 out = NULL;
8695 }
8696 if (kind1 != kind)
8697 PyMem_Free(buf1);
8698 if (kind2 != kind)
8699 PyMem_Free(buf2);
8700 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701}
8702
Alexander Belopolsky40018472011-02-26 01:02:56 +00008703static PyObject *
8704rsplit(PyUnicodeObject *self,
8705 PyUnicodeObject *substring,
8706 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008707{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708 int kind1, kind2, kind;
8709 void *buf1, *buf2;
8710 Py_ssize_t len1, len2;
8711 PyObject* out;
8712
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008713 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008714 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716 if (PyUnicode_READY(self) == -1)
8717 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719 if (substring == NULL)
8720 switch(PyUnicode_KIND(self)) {
8721 case PyUnicode_1BYTE_KIND:
8722 return ucs1lib_rsplit_whitespace(
8723 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8724 PyUnicode_GET_LENGTH(self), maxcount
8725 );
8726 case PyUnicode_2BYTE_KIND:
8727 return ucs2lib_rsplit_whitespace(
8728 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8729 PyUnicode_GET_LENGTH(self), maxcount
8730 );
8731 case PyUnicode_4BYTE_KIND:
8732 return ucs4lib_rsplit_whitespace(
8733 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8734 PyUnicode_GET_LENGTH(self), maxcount
8735 );
8736 default:
8737 assert(0);
8738 return NULL;
8739 }
8740
8741 if (PyUnicode_READY(substring) == -1)
8742 return NULL;
8743
8744 kind1 = PyUnicode_KIND(self);
8745 kind2 = PyUnicode_KIND(substring);
8746 kind = kind1 > kind2 ? kind1 : kind2;
8747 buf1 = PyUnicode_DATA(self);
8748 buf2 = PyUnicode_DATA(substring);
8749 if (kind1 != kind)
8750 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8751 if (!buf1)
8752 return NULL;
8753 if (kind2 != kind)
8754 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8755 if (!buf2) {
8756 if (kind1 != kind) PyMem_Free(buf1);
8757 return NULL;
8758 }
8759 len1 = PyUnicode_GET_LENGTH(self);
8760 len2 = PyUnicode_GET_LENGTH(substring);
8761
8762 switch(kind) {
8763 case PyUnicode_1BYTE_KIND:
8764 out = ucs1lib_rsplit(
8765 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8766 break;
8767 case PyUnicode_2BYTE_KIND:
8768 out = ucs2lib_rsplit(
8769 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8770 break;
8771 case PyUnicode_4BYTE_KIND:
8772 out = ucs4lib_rsplit(
8773 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8774 break;
8775 default:
8776 out = NULL;
8777 }
8778 if (kind1 != kind)
8779 PyMem_Free(buf1);
8780 if (kind2 != kind)
8781 PyMem_Free(buf2);
8782 return out;
8783}
8784
8785static Py_ssize_t
8786anylib_find(int kind, void *buf1, Py_ssize_t len1,
8787 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8788{
8789 switch(kind) {
8790 case PyUnicode_1BYTE_KIND:
8791 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8792 case PyUnicode_2BYTE_KIND:
8793 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8794 case PyUnicode_4BYTE_KIND:
8795 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8796 }
8797 assert(0);
8798 return -1;
8799}
8800
8801static Py_ssize_t
8802anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8803 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8804{
8805 switch(kind) {
8806 case PyUnicode_1BYTE_KIND:
8807 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8808 case PyUnicode_2BYTE_KIND:
8809 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8810 case PyUnicode_4BYTE_KIND:
8811 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8812 }
8813 assert(0);
8814 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008815}
8816
Alexander Belopolsky40018472011-02-26 01:02:56 +00008817static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008818replace(PyObject *self, PyObject *str1,
8819 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821 PyObject *u;
8822 char *sbuf = PyUnicode_DATA(self);
8823 char *buf1 = PyUnicode_DATA(str1);
8824 char *buf2 = PyUnicode_DATA(str2);
8825 int srelease = 0, release1 = 0, release2 = 0;
8826 int skind = PyUnicode_KIND(self);
8827 int kind1 = PyUnicode_KIND(str1);
8828 int kind2 = PyUnicode_KIND(str2);
8829 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8830 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8831 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832
8833 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008836 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838 if (skind < kind1)
8839 /* substring too wide to be present */
8840 goto nothing;
8841
8842 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008843 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008844 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008846 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008847 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008848 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 Py_UCS4 u1, u2, maxchar;
8850 int mayshrink, rkind;
8851 u1 = PyUnicode_READ_CHAR(str1, 0);
8852 if (!findchar(sbuf, PyUnicode_KIND(self),
8853 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008854 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 u2 = PyUnicode_READ_CHAR(str2, 0);
8856 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8857 /* Replacing u1 with u2 may cause a maxchar reduction in the
8858 result string. */
8859 mayshrink = maxchar > 127;
8860 if (u2 > maxchar) {
8861 maxchar = u2;
8862 mayshrink = 0;
8863 }
8864 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008865 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008867 if (PyUnicode_CopyCharacters(u, 0,
8868 (PyObject*)self, 0, slen) < 0)
8869 {
8870 Py_DECREF(u);
8871 return NULL;
8872 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 rkind = PyUnicode_KIND(u);
8874 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8875 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008876 if (--maxcount < 0)
8877 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008879 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008880 if (mayshrink) {
8881 PyObject *tmp = u;
8882 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8883 PyUnicode_GET_LENGTH(tmp));
8884 Py_DECREF(tmp);
8885 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008887 int rkind = skind;
8888 char *res;
8889 if (kind1 < rkind) {
8890 /* widen substring */
8891 buf1 = _PyUnicode_AsKind(str1, rkind);
8892 if (!buf1) goto error;
8893 release1 = 1;
8894 }
8895 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008896 if (i < 0)
8897 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 if (rkind > kind2) {
8899 /* widen replacement */
8900 buf2 = _PyUnicode_AsKind(str2, rkind);
8901 if (!buf2) goto error;
8902 release2 = 1;
8903 }
8904 else if (rkind < kind2) {
8905 /* widen self and buf1 */
8906 rkind = kind2;
8907 if (release1) PyMem_Free(buf1);
8908 sbuf = _PyUnicode_AsKind(self, rkind);
8909 if (!sbuf) goto error;
8910 srelease = 1;
8911 buf1 = _PyUnicode_AsKind(str1, rkind);
8912 if (!buf1) goto error;
8913 release1 = 1;
8914 }
8915 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8916 if (!res) {
8917 PyErr_NoMemory();
8918 goto error;
8919 }
8920 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008921 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8923 buf2,
8924 PyUnicode_KIND_SIZE(rkind, len2));
8925 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008926
8927 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8929 slen-i,
8930 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008931 if (i == -1)
8932 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8934 buf2,
8935 PyUnicode_KIND_SIZE(rkind, len2));
8936 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008937 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938
8939 u = PyUnicode_FromKindAndData(rkind, res, slen);
8940 PyMem_Free(res);
8941 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 Py_ssize_t n, i, j, ires;
8946 Py_ssize_t product, new_size;
8947 int rkind = skind;
8948 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 if (kind1 < rkind) {
8951 buf1 = _PyUnicode_AsKind(str1, rkind);
8952 if (!buf1) goto error;
8953 release1 = 1;
8954 }
8955 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008956 if (n == 0)
8957 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 if (kind2 < rkind) {
8959 buf2 = _PyUnicode_AsKind(str2, rkind);
8960 if (!buf2) goto error;
8961 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963 else if (kind2 > rkind) {
8964 rkind = kind2;
8965 sbuf = _PyUnicode_AsKind(self, rkind);
8966 if (!sbuf) goto error;
8967 srelease = 1;
8968 if (release1) PyMem_Free(buf1);
8969 buf1 = _PyUnicode_AsKind(str1, rkind);
8970 if (!buf1) goto error;
8971 release1 = 1;
8972 }
8973 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
8974 PyUnicode_GET_LENGTH(str1))); */
8975 product = n * (len2-len1);
8976 if ((product / (len2-len1)) != n) {
8977 PyErr_SetString(PyExc_OverflowError,
8978 "replace string is too long");
8979 goto error;
8980 }
8981 new_size = slen + product;
8982 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
8983 PyErr_SetString(PyExc_OverflowError,
8984 "replace string is too long");
8985 goto error;
8986 }
8987 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
8988 if (!res)
8989 goto error;
8990 ires = i = 0;
8991 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008992 while (n-- > 0) {
8993 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994 j = anylib_find(rkind,
8995 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8996 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008997 if (j == -1)
8998 break;
8999 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009000 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9002 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9003 PyUnicode_KIND_SIZE(rkind, j-i));
9004 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009005 }
9006 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007 if (len2 > 0) {
9008 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9009 buf2,
9010 PyUnicode_KIND_SIZE(rkind, len2));
9011 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009015 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009016 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9018 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9019 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009020 } else {
9021 /* interleave */
9022 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9024 buf2,
9025 PyUnicode_KIND_SIZE(rkind, len2));
9026 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009027 if (--n <= 0)
9028 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009029 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9030 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9031 PyUnicode_KIND_SIZE(rkind, 1));
9032 ires++;
9033 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9036 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9037 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041 if (srelease)
9042 PyMem_FREE(sbuf);
9043 if (release1)
9044 PyMem_FREE(buf1);
9045 if (release2)
9046 PyMem_FREE(buf2);
9047 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009048
Benjamin Peterson29060642009-01-31 22:14:21 +00009049 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009050 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 if (srelease)
9052 PyMem_FREE(sbuf);
9053 if (release1)
9054 PyMem_FREE(buf1);
9055 if (release2)
9056 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009057 if (PyUnicode_CheckExact(self)) {
9058 Py_INCREF(self);
9059 return (PyObject *) self;
9060 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 return PyUnicode_FromKindAndData(PyUnicode_KIND(self),
9062 PyUnicode_DATA(self),
9063 PyUnicode_GET_LENGTH(self));
9064 error:
9065 if (srelease && sbuf)
9066 PyMem_FREE(sbuf);
9067 if (release1 && buf1)
9068 PyMem_FREE(buf1);
9069 if (release2 && buf2)
9070 PyMem_FREE(buf2);
9071 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072}
9073
9074/* --- Unicode Object Methods --------------------------------------------- */
9075
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009076PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009077 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009078\n\
9079Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009080characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081
9082static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009083unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085 return fixup(self, fixtitle);
9086}
9087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009088PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090\n\
9091Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009092have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093
9094static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009095unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097 return fixup(self, fixcapitalize);
9098}
9099
9100#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009101PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103\n\
9104Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009105normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106
9107static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009108unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109{
9110 PyObject *list;
9111 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009112 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114 /* Split into words */
9115 list = split(self, NULL, -1);
9116 if (!list)
9117 return NULL;
9118
9119 /* Capitalize each word */
9120 for (i = 0; i < PyList_GET_SIZE(list); i++) {
9121 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123 if (item == NULL)
9124 goto onError;
9125 Py_DECREF(PyList_GET_ITEM(list, i));
9126 PyList_SET_ITEM(list, i, item);
9127 }
9128
9129 /* Join the words to form a new string */
9130 item = PyUnicode_Join(NULL, list);
9131
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133 Py_DECREF(list);
9134 return (PyObject *)item;
9135}
9136#endif
9137
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009138/* Argument converter. Coerces to a single unicode character */
9139
9140static int
9141convert_uc(PyObject *obj, void *addr)
9142{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009143 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009144 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009145
Benjamin Peterson14339b62009-01-31 16:36:08 +00009146 uniobj = PyUnicode_FromObject(obj);
9147 if (uniobj == NULL) {
9148 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009149 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009150 return 0;
9151 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009153 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009155 Py_DECREF(uniobj);
9156 return 0;
9157 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 if (PyUnicode_READY(uniobj)) {
9159 Py_DECREF(uniobj);
9160 return 0;
9161 }
9162 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009163 Py_DECREF(uniobj);
9164 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009165}
9166
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009167PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009170Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009171done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172
9173static PyObject *
9174unicode_center(PyUnicodeObject *self, PyObject *args)
9175{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009176 Py_ssize_t marg, left;
9177 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 Py_UCS4 fillchar = ' ';
9179
9180 if (PyUnicode_READY(self) == -1)
9181 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182
Thomas Woutersde017742006-02-16 19:34:37 +00009183 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184 return NULL;
9185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009187 Py_INCREF(self);
9188 return (PyObject*) self;
9189 }
9190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192 left = marg / 2 + (marg & width & 1);
9193
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009194 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195}
9196
Marc-André Lemburge5034372000-08-08 08:04:29 +00009197#if 0
9198
9199/* This code should go into some future Unicode collation support
9200 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009201 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009202
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009203/* speedy UTF-16 code point order comparison */
9204/* gleaned from: */
9205/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9206
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009207static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009208{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009209 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009210 0, 0, 0, 0, 0, 0, 0, 0,
9211 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009212 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009213};
9214
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215static int
9216unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9217{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009218 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009219
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220 Py_UNICODE *s1 = str1->str;
9221 Py_UNICODE *s2 = str2->str;
9222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 len1 = str1->_base._base.length;
9224 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009225
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009227 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009228
9229 c1 = *s1++;
9230 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009231
Benjamin Peterson29060642009-01-31 22:14:21 +00009232 if (c1 > (1<<11) * 26)
9233 c1 += utf16Fixup[c1>>11];
9234 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009235 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009236 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009237
9238 if (c1 != c2)
9239 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009240
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009241 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009242 }
9243
9244 return (len1 < len2) ? -1 : (len1 != len2);
9245}
9246
Marc-André Lemburge5034372000-08-08 08:04:29 +00009247#else
9248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249/* This function assumes that str1 and str2 are readied by the caller. */
9250
Marc-André Lemburge5034372000-08-08 08:04:29 +00009251static int
9252unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 int kind1, kind2;
9255 void *data1, *data2;
9256 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 kind1 = PyUnicode_KIND(str1);
9259 kind2 = PyUnicode_KIND(str2);
9260 data1 = PyUnicode_DATA(str1);
9261 data2 = PyUnicode_DATA(str2);
9262 len1 = PyUnicode_GET_LENGTH(str1);
9263 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 for (i = 0; i < len1 && i < len2; ++i) {
9266 Py_UCS4 c1, c2;
9267 c1 = PyUnicode_READ(kind1, data1, i);
9268 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009269
9270 if (c1 != c2)
9271 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009272 }
9273
9274 return (len1 < len2) ? -1 : (len1 != len2);
9275}
9276
9277#endif
9278
Alexander Belopolsky40018472011-02-26 01:02:56 +00009279int
9280PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9283 if (PyUnicode_READY(left) == -1 ||
9284 PyUnicode_READY(right) == -1)
9285 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009286 return unicode_compare((PyUnicodeObject *)left,
9287 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009289 PyErr_Format(PyExc_TypeError,
9290 "Can't compare %.100s and %.100s",
9291 left->ob_type->tp_name,
9292 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293 return -1;
9294}
9295
Martin v. Löwis5b222132007-06-10 09:51:05 +00009296int
9297PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 Py_ssize_t i;
9300 int kind;
9301 void *data;
9302 Py_UCS4 chr;
9303
Martin v. Löwis5b222132007-06-10 09:51:05 +00009304 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 if (PyUnicode_READY(uni) == -1)
9306 return -1;
9307 kind = PyUnicode_KIND(uni);
9308 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009309 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9311 if (chr != str[i])
9312 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009313 /* This check keeps Python strings that end in '\0' from comparing equal
9314 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009316 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009317 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009318 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009319 return 0;
9320}
9321
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009322
Benjamin Peterson29060642009-01-31 22:14:21 +00009323#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009324 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009325
Alexander Belopolsky40018472011-02-26 01:02:56 +00009326PyObject *
9327PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009328{
9329 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009330
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009331 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9332 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 if (PyUnicode_READY(left) == -1 ||
9334 PyUnicode_READY(right) == -1)
9335 return NULL;
9336 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9337 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009338 if (op == Py_EQ) {
9339 Py_INCREF(Py_False);
9340 return Py_False;
9341 }
9342 if (op == Py_NE) {
9343 Py_INCREF(Py_True);
9344 return Py_True;
9345 }
9346 }
9347 if (left == right)
9348 result = 0;
9349 else
9350 result = unicode_compare((PyUnicodeObject *)left,
9351 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009352
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009353 /* Convert the return value to a Boolean */
9354 switch (op) {
9355 case Py_EQ:
9356 v = TEST_COND(result == 0);
9357 break;
9358 case Py_NE:
9359 v = TEST_COND(result != 0);
9360 break;
9361 case Py_LE:
9362 v = TEST_COND(result <= 0);
9363 break;
9364 case Py_GE:
9365 v = TEST_COND(result >= 0);
9366 break;
9367 case Py_LT:
9368 v = TEST_COND(result == -1);
9369 break;
9370 case Py_GT:
9371 v = TEST_COND(result == 1);
9372 break;
9373 default:
9374 PyErr_BadArgument();
9375 return NULL;
9376 }
9377 Py_INCREF(v);
9378 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009379 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009380
Brian Curtindfc80e32011-08-10 20:28:54 -05009381 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009382}
9383
Alexander Belopolsky40018472011-02-26 01:02:56 +00009384int
9385PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009386{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009387 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 int kind1, kind2, kind;
9389 void *buf1, *buf2;
9390 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009391 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009392
9393 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009394 sub = PyUnicode_FromObject(element);
9395 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009396 PyErr_Format(PyExc_TypeError,
9397 "'in <string>' requires string as left operand, not %s",
9398 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009399 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 if (PyUnicode_READY(sub) == -1)
9402 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009403
Thomas Wouters477c8d52006-05-27 19:21:47 +00009404 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009406 Py_DECREF(sub);
9407 return -1;
9408 }
9409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 kind1 = PyUnicode_KIND(str);
9411 kind2 = PyUnicode_KIND(sub);
9412 kind = kind1 > kind2 ? kind1 : kind2;
9413 buf1 = PyUnicode_DATA(str);
9414 buf2 = PyUnicode_DATA(sub);
9415 if (kind1 != kind)
9416 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9417 if (!buf1) {
9418 Py_DECREF(sub);
9419 return -1;
9420 }
9421 if (kind2 != kind)
9422 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9423 if (!buf2) {
9424 Py_DECREF(sub);
9425 if (kind1 != kind) PyMem_Free(buf1);
9426 return -1;
9427 }
9428 len1 = PyUnicode_GET_LENGTH(str);
9429 len2 = PyUnicode_GET_LENGTH(sub);
9430
9431 switch(kind) {
9432 case PyUnicode_1BYTE_KIND:
9433 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9434 break;
9435 case PyUnicode_2BYTE_KIND:
9436 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9437 break;
9438 case PyUnicode_4BYTE_KIND:
9439 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9440 break;
9441 default:
9442 result = -1;
9443 assert(0);
9444 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009445
9446 Py_DECREF(str);
9447 Py_DECREF(sub);
9448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 if (kind1 != kind)
9450 PyMem_Free(buf1);
9451 if (kind2 != kind)
9452 PyMem_Free(buf2);
9453
Guido van Rossum403d68b2000-03-13 15:55:09 +00009454 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009455}
9456
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457/* Concat to string or Unicode object giving a new Unicode object. */
9458
Alexander Belopolsky40018472011-02-26 01:02:56 +00009459PyObject *
9460PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 PyObject *u = NULL, *v = NULL, *w;
9463 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009464
9465 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009468 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009471 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472
9473 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009479 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481 }
9482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9484 goto onError;
9485
9486 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009487 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 w = PyUnicode_New(
9491 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9492 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009494 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009495 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9496 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009497 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009498 v, 0,
9499 PyUnicode_GET_LENGTH(v)) < 0)
9500 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501 Py_DECREF(u);
9502 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506 Py_XDECREF(u);
9507 Py_XDECREF(v);
9508 return NULL;
9509}
9510
Walter Dörwald1ab83302007-05-18 17:15:44 +00009511void
9512PyUnicode_Append(PyObject **pleft, PyObject *right)
9513{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009514 PyObject *new;
9515 if (*pleft == NULL)
9516 return;
9517 if (right == NULL || !PyUnicode_Check(*pleft)) {
9518 Py_DECREF(*pleft);
9519 *pleft = NULL;
9520 return;
9521 }
9522 new = PyUnicode_Concat(*pleft, right);
9523 Py_DECREF(*pleft);
9524 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009525}
9526
9527void
9528PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9529{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009530 PyUnicode_Append(pleft, right);
9531 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009532}
9533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009534PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009537Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009538string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009539interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540
9541static PyObject *
9542unicode_count(PyUnicodeObject *self, PyObject *args)
9543{
9544 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009545 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009546 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009548 int kind1, kind2, kind;
9549 void *buf1, *buf2;
9550 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551
Jesus Ceaac451502011-04-20 17:09:23 +02009552 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9553 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009554 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556 kind1 = PyUnicode_KIND(self);
9557 kind2 = PyUnicode_KIND(substring);
9558 kind = kind1 > kind2 ? kind1 : kind2;
9559 buf1 = PyUnicode_DATA(self);
9560 buf2 = PyUnicode_DATA(substring);
9561 if (kind1 != kind)
9562 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9563 if (!buf1) {
9564 Py_DECREF(substring);
9565 return NULL;
9566 }
9567 if (kind2 != kind)
9568 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9569 if (!buf2) {
9570 Py_DECREF(substring);
9571 if (kind1 != kind) PyMem_Free(buf1);
9572 return NULL;
9573 }
9574 len1 = PyUnicode_GET_LENGTH(self);
9575 len2 = PyUnicode_GET_LENGTH(substring);
9576
9577 ADJUST_INDICES(start, end, len1);
9578 switch(kind) {
9579 case PyUnicode_1BYTE_KIND:
9580 iresult = ucs1lib_count(
9581 ((Py_UCS1*)buf1) + start, end - start,
9582 buf2, len2, PY_SSIZE_T_MAX
9583 );
9584 break;
9585 case PyUnicode_2BYTE_KIND:
9586 iresult = ucs2lib_count(
9587 ((Py_UCS2*)buf1) + start, end - start,
9588 buf2, len2, PY_SSIZE_T_MAX
9589 );
9590 break;
9591 case PyUnicode_4BYTE_KIND:
9592 iresult = ucs4lib_count(
9593 ((Py_UCS4*)buf1) + start, end - start,
9594 buf2, len2, PY_SSIZE_T_MAX
9595 );
9596 break;
9597 default:
9598 assert(0); iresult = 0;
9599 }
9600
9601 result = PyLong_FromSsize_t(iresult);
9602
9603 if (kind1 != kind)
9604 PyMem_Free(buf1);
9605 if (kind2 != kind)
9606 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009607
9608 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009609
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610 return result;
9611}
9612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009613PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009614 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009615\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009616Encode S using the codec registered for encoding. Default encoding\n\
9617is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009618handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009619a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9620'xmlcharrefreplace' as well as any other name registered with\n\
9621codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622
9623static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009624unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009626 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627 char *encoding = NULL;
9628 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009629
Benjamin Peterson308d6372009-09-18 21:42:35 +00009630 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9631 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009633 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009634}
9635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009636PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009637 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638\n\
9639Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009640If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641
9642static PyObject*
9643unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9644{
9645 Py_UNICODE *e;
9646 Py_UNICODE *p;
9647 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009648 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650 PyUnicodeObject *u;
9651 int tabsize = 8;
9652
9653 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9657 return NULL;
9658
Thomas Wouters7e474022000-07-16 12:04:32 +00009659 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009660 i = 0; /* chars up to and including most recent \n or \r */
9661 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009662 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9663 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009665 if (tabsize > 0) {
9666 incr = tabsize - (j % tabsize); /* cannot overflow */
9667 if (j > PY_SSIZE_T_MAX - incr)
9668 goto overflow1;
9669 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009670 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009671 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009673 if (j > PY_SSIZE_T_MAX - 1)
9674 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675 j++;
9676 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009677 if (i > PY_SSIZE_T_MAX - j)
9678 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009680 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681 }
9682 }
9683
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009684 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009685 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009686
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687 /* Second pass: create output string and fill it */
9688 u = _PyUnicode_New(i + j);
9689 if (!u)
9690 return NULL;
9691
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009692 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 q = _PyUnicode_WSTR(u); /* next output char */
9694 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009698 if (tabsize > 0) {
9699 i = tabsize - (j % tabsize);
9700 j += i;
9701 while (i--) {
9702 if (q >= qe)
9703 goto overflow2;
9704 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009705 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009706 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009707 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009708 else {
9709 if (q >= qe)
9710 goto overflow2;
9711 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009712 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713 if (*p == '\n' || *p == '\r')
9714 j = 0;
9715 }
9716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 if (PyUnicode_READY(u) == -1) {
9718 Py_DECREF(u);
9719 return NULL;
9720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009722
9723 overflow2:
9724 Py_DECREF(u);
9725 overflow1:
9726 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9727 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728}
9729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009730PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009731 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009732\n\
9733Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009734such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735arguments start and end are interpreted as in slice notation.\n\
9736\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009737Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738
9739static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741{
Jesus Ceaac451502011-04-20 17:09:23 +02009742 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009743 Py_ssize_t start;
9744 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009745 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009746
Jesus Ceaac451502011-04-20 17:09:23 +02009747 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9748 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751 if (PyUnicode_READY(self) == -1)
9752 return NULL;
9753 if (PyUnicode_READY(substring) == -1)
9754 return NULL;
9755
9756 result = any_find_slice(
9757 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9758 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009759 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760
9761 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763 if (result == -2)
9764 return NULL;
9765
Christian Heimes217cfd12007-12-02 14:31:20 +00009766 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009767}
9768
9769static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009770unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 Py_UCS4 ch;
9773
9774 if (PyUnicode_READY(self) == -1)
9775 return NULL;
9776 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009777 PyErr_SetString(PyExc_IndexError, "string index out of range");
9778 return NULL;
9779 }
9780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9782 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009783}
9784
Guido van Rossumc2504932007-09-18 19:42:40 +00009785/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009786 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009787static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009788unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009789{
Guido van Rossumc2504932007-09-18 19:42:40 +00009790 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009791 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 if (_PyUnicode_HASH(self) != -1)
9794 return _PyUnicode_HASH(self);
9795 if (PyUnicode_READY(self) == -1)
9796 return -1;
9797 len = PyUnicode_GET_LENGTH(self);
9798
9799 /* The hash function as a macro, gets expanded three times below. */
9800#define HASH(P) \
9801 x = (Py_uhash_t)*P << 7; \
9802 while (--len >= 0) \
9803 x = (1000003*x) ^ (Py_uhash_t)*P++;
9804
9805 switch (PyUnicode_KIND(self)) {
9806 case PyUnicode_1BYTE_KIND: {
9807 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9808 HASH(c);
9809 break;
9810 }
9811 case PyUnicode_2BYTE_KIND: {
9812 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9813 HASH(s);
9814 break;
9815 }
9816 default: {
9817 Py_UCS4 *l;
9818 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9819 "Impossible switch case in unicode_hash");
9820 l = PyUnicode_4BYTE_DATA(self);
9821 HASH(l);
9822 break;
9823 }
9824 }
9825 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9826
Guido van Rossumc2504932007-09-18 19:42:40 +00009827 if (x == -1)
9828 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009830 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009834PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009835 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009837Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009838
9839static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009842 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009843 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009844 Py_ssize_t start;
9845 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846
Jesus Ceaac451502011-04-20 17:09:23 +02009847 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9848 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 if (PyUnicode_READY(self) == -1)
9852 return NULL;
9853 if (PyUnicode_READY(substring) == -1)
9854 return NULL;
9855
9856 result = any_find_slice(
9857 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9858 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009859 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860
9861 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 if (result == -2)
9864 return NULL;
9865
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866 if (result < 0) {
9867 PyErr_SetString(PyExc_ValueError, "substring not found");
9868 return NULL;
9869 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009870
Christian Heimes217cfd12007-12-02 14:31:20 +00009871 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872}
9873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009874PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009875 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009876\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009877Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009878at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879
9880static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009881unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 Py_ssize_t i, length;
9884 int kind;
9885 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886 int cased;
9887
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009888 if (PyUnicode_READY(self) == -1)
9889 return NULL;
9890 length = PyUnicode_GET_LENGTH(self);
9891 kind = PyUnicode_KIND(self);
9892 data = PyUnicode_DATA(self);
9893
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 if (length == 1)
9896 return PyBool_FromLong(
9897 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009899 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009901 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009902
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 for (i = 0; i < length; i++) {
9905 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009906
Benjamin Peterson29060642009-01-31 22:14:21 +00009907 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9908 return PyBool_FromLong(0);
9909 else if (!cased && Py_UNICODE_ISLOWER(ch))
9910 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009912 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913}
9914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009915PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009916 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009918Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009919at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920
9921static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009922unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 Py_ssize_t i, length;
9925 int kind;
9926 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927 int cased;
9928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 if (PyUnicode_READY(self) == -1)
9930 return NULL;
9931 length = PyUnicode_GET_LENGTH(self);
9932 kind = PyUnicode_KIND(self);
9933 data = PyUnicode_DATA(self);
9934
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 if (length == 1)
9937 return PyBool_FromLong(
9938 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009940 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009942 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009943
Guido van Rossumd57fd912000-03-10 22:53:23 +00009944 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 for (i = 0; i < length; i++) {
9946 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009947
Benjamin Peterson29060642009-01-31 22:14:21 +00009948 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9949 return PyBool_FromLong(0);
9950 else if (!cased && Py_UNICODE_ISUPPER(ch))
9951 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009953 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009954}
9955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009956PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009957 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009959Return True if S is a titlecased string and there is at least one\n\
9960character in S, i.e. upper- and titlecase characters may only\n\
9961follow uncased characters and lowercase characters only cased ones.\n\
9962Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009963
9964static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009965unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 Py_ssize_t i, length;
9968 int kind;
9969 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970 int cased, previous_is_cased;
9971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 if (PyUnicode_READY(self) == -1)
9973 return NULL;
9974 length = PyUnicode_GET_LENGTH(self);
9975 kind = PyUnicode_KIND(self);
9976 data = PyUnicode_DATA(self);
9977
Guido van Rossumd57fd912000-03-10 22:53:23 +00009978 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 if (length == 1) {
9980 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
9981 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
9982 (Py_UNICODE_ISUPPER(ch) != 0));
9983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009985 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009987 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009988
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989 cased = 0;
9990 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 for (i = 0; i < length; i++) {
9992 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009993
Benjamin Peterson29060642009-01-31 22:14:21 +00009994 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
9995 if (previous_is_cased)
9996 return PyBool_FromLong(0);
9997 previous_is_cased = 1;
9998 cased = 1;
9999 }
10000 else if (Py_UNICODE_ISLOWER(ch)) {
10001 if (!previous_is_cased)
10002 return PyBool_FromLong(0);
10003 previous_is_cased = 1;
10004 cased = 1;
10005 }
10006 else
10007 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010009 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010}
10011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010012PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010013 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010015Return True if all characters in S are whitespace\n\
10016and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017
10018static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010019unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 Py_ssize_t i, length;
10022 int kind;
10023 void *data;
10024
10025 if (PyUnicode_READY(self) == -1)
10026 return NULL;
10027 length = PyUnicode_GET_LENGTH(self);
10028 kind = PyUnicode_KIND(self);
10029 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 if (length == 1)
10033 return PyBool_FromLong(
10034 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010036 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010038 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 for (i = 0; i < length; i++) {
10041 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010042 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010043 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010045 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046}
10047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010048PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010049 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010050\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010051Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010052and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010053
10054static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010055unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010056{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 Py_ssize_t i, length;
10058 int kind;
10059 void *data;
10060
10061 if (PyUnicode_READY(self) == -1)
10062 return NULL;
10063 length = PyUnicode_GET_LENGTH(self);
10064 kind = PyUnicode_KIND(self);
10065 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010066
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010067 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 if (length == 1)
10069 return PyBool_FromLong(
10070 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010071
10072 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010074 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 for (i = 0; i < length; i++) {
10077 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010078 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010079 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010080 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010081}
10082
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010083PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010084 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010085\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010086Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010087and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010088
10089static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010090unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010091{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 int kind;
10093 void *data;
10094 Py_ssize_t len, i;
10095
10096 if (PyUnicode_READY(self) == -1)
10097 return NULL;
10098
10099 kind = PyUnicode_KIND(self);
10100 data = PyUnicode_DATA(self);
10101 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010102
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010103 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 if (len == 1) {
10105 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10106 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10107 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010108
10109 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010111 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 for (i = 0; i < len; i++) {
10114 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010115 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010116 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010117 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010118 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010119}
10120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010121PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010122 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010124Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010125False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010126
10127static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010128unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 Py_ssize_t i, length;
10131 int kind;
10132 void *data;
10133
10134 if (PyUnicode_READY(self) == -1)
10135 return NULL;
10136 length = PyUnicode_GET_LENGTH(self);
10137 kind = PyUnicode_KIND(self);
10138 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 if (length == 1)
10142 return PyBool_FromLong(
10143 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010145 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010147 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 for (i = 0; i < length; i++) {
10150 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010151 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010153 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154}
10155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010156PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010157 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010159Return True if all characters in S are digits\n\
10160and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161
10162static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010163unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 Py_ssize_t i, length;
10166 int kind;
10167 void *data;
10168
10169 if (PyUnicode_READY(self) == -1)
10170 return NULL;
10171 length = PyUnicode_GET_LENGTH(self);
10172 kind = PyUnicode_KIND(self);
10173 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 if (length == 1) {
10177 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10178 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010181 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010183 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 for (i = 0; i < length; i++) {
10186 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010187 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010189 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010190}
10191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010192PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010193 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010195Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010196False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010197
10198static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010199unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 Py_ssize_t i, length;
10202 int kind;
10203 void *data;
10204
10205 if (PyUnicode_READY(self) == -1)
10206 return NULL;
10207 length = PyUnicode_GET_LENGTH(self);
10208 kind = PyUnicode_KIND(self);
10209 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 if (length == 1)
10213 return PyBool_FromLong(
10214 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010215
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010216 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010218 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 for (i = 0; i < length; i++) {
10221 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010222 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010224 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225}
10226
Martin v. Löwis47383402007-08-15 07:32:56 +000010227int
10228PyUnicode_IsIdentifier(PyObject *self)
10229{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 int kind;
10231 void *data;
10232 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010233 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 if (PyUnicode_READY(self) == -1) {
10236 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010237 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 }
10239
10240 /* Special case for empty strings */
10241 if (PyUnicode_GET_LENGTH(self) == 0)
10242 return 0;
10243 kind = PyUnicode_KIND(self);
10244 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010245
10246 /* PEP 3131 says that the first character must be in
10247 XID_Start and subsequent characters in XID_Continue,
10248 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010249 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010250 letters, digits, underscore). However, given the current
10251 definition of XID_Start and XID_Continue, it is sufficient
10252 to check just for these, except that _ must be allowed
10253 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010255 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010256 return 0;
10257
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010258 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010260 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010261 return 1;
10262}
10263
10264PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010265 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010266\n\
10267Return True if S is a valid identifier according\n\
10268to the language definition.");
10269
10270static PyObject*
10271unicode_isidentifier(PyObject *self)
10272{
10273 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10274}
10275
Georg Brandl559e5d72008-06-11 18:37:52 +000010276PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010277 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010278\n\
10279Return True if all characters in S are considered\n\
10280printable in repr() or S is empty, False otherwise.");
10281
10282static PyObject*
10283unicode_isprintable(PyObject *self)
10284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 Py_ssize_t i, length;
10286 int kind;
10287 void *data;
10288
10289 if (PyUnicode_READY(self) == -1)
10290 return NULL;
10291 length = PyUnicode_GET_LENGTH(self);
10292 kind = PyUnicode_KIND(self);
10293 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010294
10295 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 if (length == 1)
10297 return PyBool_FromLong(
10298 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 for (i = 0; i < length; i++) {
10301 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010302 Py_RETURN_FALSE;
10303 }
10304 }
10305 Py_RETURN_TRUE;
10306}
10307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010308PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010309 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310\n\
10311Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010312iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313
10314static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010315unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010317 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318}
10319
Martin v. Löwis18e16552006-02-15 17:27:45 +000010320static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321unicode_length(PyUnicodeObject *self)
10322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 if (PyUnicode_READY(self) == -1)
10324 return -1;
10325 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326}
10327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010328PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010329 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010331Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010332done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333
10334static PyObject *
10335unicode_ljust(PyUnicodeObject *self, PyObject *args)
10336{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010337 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 Py_UCS4 fillchar = ' ';
10339
10340 if (PyUnicode_READY(self) == -1)
10341 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010342
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010343 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344 return NULL;
10345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347 Py_INCREF(self);
10348 return (PyObject*) self;
10349 }
10350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352}
10353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010354PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010355 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010357Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358
10359static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010360unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362 return fixup(self, fixlower);
10363}
10364
/* Strip modes.  The values double as indices into stripformat[] below,
   so they must remain 0, 1 and 2 in this order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format string for each strip mode, indexed by the
   mode constants above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
10373
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010374/* externally visible for str.strip(unicode) */
10375PyObject *
10376_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10377{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 void *data;
10379 int kind;
10380 Py_ssize_t i, j, len;
10381 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10384 return NULL;
10385
10386 kind = PyUnicode_KIND(self);
10387 data = PyUnicode_DATA(self);
10388 len = PyUnicode_GET_LENGTH(self);
10389 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10390 PyUnicode_DATA(sepobj),
10391 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010392
Benjamin Peterson14339b62009-01-31 16:36:08 +000010393 i = 0;
10394 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 while (i < len &&
10396 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010397 i++;
10398 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010399 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010400
Benjamin Peterson14339b62009-01-31 16:36:08 +000010401 j = len;
10402 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010403 do {
10404 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 } while (j >= i &&
10406 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010407 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010408 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010409
Benjamin Peterson14339b62009-01-31 16:36:08 +000010410 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010411 Py_INCREF(self);
10412 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010413 }
10414 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010416}
10417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418/* Assumes an already ready self string. */
10419
10420static PyObject *
10421substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10422{
10423 const int kind = PyUnicode_KIND(self);
10424 void *data = PyUnicode_DATA(self);
10425 Py_UCS4 maxchar = 0;
10426 Py_ssize_t i;
10427 PyObject *unicode;
10428
10429 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10430 PyErr_BadInternalCall();
10431 return NULL;
10432 }
10433
10434 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10435 Py_INCREF(self);
10436 return (PyObject*)self;
10437 }
10438
10439 for (i = 0; i < len; ++i) {
10440 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10441 if (ch > maxchar)
10442 maxchar = ch;
10443 }
10444
10445 unicode = PyUnicode_New(len, maxchar);
10446 if (unicode == NULL)
10447 return NULL;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010448 if (PyUnicode_CopyCharacters(unicode, 0,
10449 (PyObject*)self, start, len) < 0)
10450 {
10451 Py_DECREF(unicode);
10452 return NULL;
10453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 return unicode;
10455}
10456
10457PyObject*
10458PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10459{
10460 unsigned char *data;
10461 int kind;
10462
10463 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10464 && PyUnicode_CheckExact(self))
10465 {
10466 Py_INCREF(self);
10467 return (PyObject *)self;
10468 }
10469
10470 if ((end - start) == 1)
10471 return unicode_getitem((PyUnicodeObject*)self, start);
10472
10473 if (PyUnicode_READY(self) == -1)
10474 return NULL;
10475 kind = PyUnicode_KIND(self);
10476 data = PyUnicode_1BYTE_DATA(self);
10477 return PyUnicode_FromKindAndData(kind, data + PyUnicode_KIND_SIZE(kind, start),
10478 end-start);
10479}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480
10481static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010482do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 int kind;
10485 void *data;
10486 Py_ssize_t len, i, j;
10487
10488 if (PyUnicode_READY(self) == -1)
10489 return NULL;
10490
10491 kind = PyUnicode_KIND(self);
10492 data = PyUnicode_DATA(self);
10493 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010494
Benjamin Peterson14339b62009-01-31 16:36:08 +000010495 i = 0;
10496 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010498 i++;
10499 }
10500 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010501
Benjamin Peterson14339b62009-01-31 16:36:08 +000010502 j = len;
10503 if (striptype != LEFTSTRIP) {
10504 do {
10505 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010507 j++;
10508 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010509
Benjamin Peterson14339b62009-01-31 16:36:08 +000010510 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10511 Py_INCREF(self);
10512 return (PyObject*)self;
10513 }
10514 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516}
10517
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010518
10519static PyObject *
10520do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10521{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010522 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010523
Benjamin Peterson14339b62009-01-31 16:36:08 +000010524 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10525 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010526
Benjamin Peterson14339b62009-01-31 16:36:08 +000010527 if (sep != NULL && sep != Py_None) {
10528 if (PyUnicode_Check(sep))
10529 return _PyUnicode_XStrip(self, striptype, sep);
10530 else {
10531 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010532 "%s arg must be None or str",
10533 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010534 return NULL;
10535 }
10536 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010537
Benjamin Peterson14339b62009-01-31 16:36:08 +000010538 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010539}
10540
10541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010542PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010543 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010544\n\
10545Return a copy of the string S with leading and trailing\n\
10546whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010547If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010548
10549static PyObject *
10550unicode_strip(PyUnicodeObject *self, PyObject *args)
10551{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010552 if (PyTuple_GET_SIZE(args) == 0)
10553 return do_strip(self, BOTHSTRIP); /* Common case */
10554 else
10555 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010556}
10557
10558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010559PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010560 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010561\n\
10562Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010563If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010564
10565static PyObject *
10566unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10567{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010568 if (PyTuple_GET_SIZE(args) == 0)
10569 return do_strip(self, LEFTSTRIP); /* Common case */
10570 else
10571 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010572}
10573
10574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010575PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010576 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010577\n\
10578Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010579If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010580
10581static PyObject *
10582unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10583{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010584 if (PyTuple_GET_SIZE(args) == 0)
10585 return do_strip(self, RIGHTSTRIP); /* Common case */
10586 else
10587 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010588}
10589
10590
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010592unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593{
10594 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 Py_ssize_t nchars, n;
10596 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597
Georg Brandl222de0f2009-04-12 12:01:50 +000010598 if (len < 1) {
10599 Py_INCREF(unicode_empty);
10600 return (PyObject *)unicode_empty;
10601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602
Tim Peters7a29bd52001-09-12 03:03:31 +000010603 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604 /* no repeat, return original string */
10605 Py_INCREF(str);
10606 return (PyObject*) str;
10607 }
Tim Peters8f422462000-09-09 06:13:41 +000010608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 if (PyUnicode_READY(str) == -1)
10610 return NULL;
10611
Tim Peters8f422462000-09-09 06:13:41 +000010612 /* ensure # of chars needed doesn't overflow int and # of bytes
10613 * needed doesn't overflow size_t
10614 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 nchars = len * PyUnicode_GET_LENGTH(str);
10616 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010617 PyErr_SetString(PyExc_OverflowError,
10618 "repeated string is too long");
10619 return NULL;
10620 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 char_size = PyUnicode_CHARACTER_SIZE(str);
10622 nbytes = (nchars + 1) * char_size;
10623 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010624 PyErr_SetString(PyExc_OverflowError,
10625 "repeated string is too long");
10626 return NULL;
10627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010629 if (!u)
10630 return NULL;
10631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 if (PyUnicode_GET_LENGTH(str) == 1) {
10633 const int kind = PyUnicode_KIND(str);
10634 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10635 void *to = PyUnicode_DATA(u);
10636 for (n = 0; n < len; ++n)
10637 PyUnicode_WRITE(kind, to, n, fill_char);
10638 }
10639 else {
10640 /* number of characters copied this far */
10641 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10642 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10643 char *to = (char *) PyUnicode_DATA(u);
10644 Py_MEMCPY(to, PyUnicode_DATA(str),
10645 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 n = (done <= nchars-done) ? done : nchars-done;
10648 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010651 }
10652
10653 return (PyObject*) u;
10654}
10655
Alexander Belopolsky40018472011-02-26 01:02:56 +000010656PyObject *
10657PyUnicode_Replace(PyObject *obj,
10658 PyObject *subobj,
10659 PyObject *replobj,
10660 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010661{
10662 PyObject *self;
10663 PyObject *str1;
10664 PyObject *str2;
10665 PyObject *result;
10666
10667 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010672 Py_DECREF(self);
10673 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674 }
10675 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010677 Py_DECREF(self);
10678 Py_DECREF(str1);
10679 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682 Py_DECREF(self);
10683 Py_DECREF(str1);
10684 Py_DECREF(str2);
10685 return result;
10686}
10687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010688PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010689 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690\n\
10691Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010692old replaced by new. If the optional argument count is\n\
10693given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694
10695static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 PyObject *str1;
10699 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010700 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701 PyObject *result;
10702
Martin v. Löwis18e16552006-02-15 17:27:45 +000010703 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010706 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 str1 = PyUnicode_FromObject(str1);
10708 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10709 return NULL;
10710 str2 = PyUnicode_FromObject(str2);
10711 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010712 Py_DECREF(str1);
10713 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010714 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715
10716 result = replace(self, str1, str2, maxcount);
10717
10718 Py_DECREF(str1);
10719 Py_DECREF(str2);
10720 return result;
10721}
10722
Alexander Belopolsky40018472011-02-26 01:02:56 +000010723static PyObject *
10724unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010726 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 Py_ssize_t isize;
10728 Py_ssize_t osize, squote, dquote, i, o;
10729 Py_UCS4 max, quote;
10730 int ikind, okind;
10731 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010734 return NULL;
10735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 isize = PyUnicode_GET_LENGTH(unicode);
10737 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 /* Compute length of output, quote characters, and
10740 maximum character */
10741 osize = 2; /* quotes */
10742 max = 127;
10743 squote = dquote = 0;
10744 ikind = PyUnicode_KIND(unicode);
10745 for (i = 0; i < isize; i++) {
10746 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10747 switch (ch) {
10748 case '\'': squote++; osize++; break;
10749 case '"': dquote++; osize++; break;
10750 case '\\': case '\t': case '\r': case '\n':
10751 osize += 2; break;
10752 default:
10753 /* Fast-path ASCII */
10754 if (ch < ' ' || ch == 0x7f)
10755 osize += 4; /* \xHH */
10756 else if (ch < 0x7f)
10757 osize++;
10758 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10759 osize++;
10760 max = ch > max ? ch : max;
10761 }
10762 else if (ch < 0x100)
10763 osize += 4; /* \xHH */
10764 else if (ch < 0x10000)
10765 osize += 6; /* \uHHHH */
10766 else
10767 osize += 10; /* \uHHHHHHHH */
10768 }
10769 }
10770
10771 quote = '\'';
10772 if (squote) {
10773 if (dquote)
10774 /* Both squote and dquote present. Use squote,
10775 and escape them */
10776 osize += squote;
10777 else
10778 quote = '"';
10779 }
10780
10781 repr = PyUnicode_New(osize, max);
10782 if (repr == NULL)
10783 return NULL;
10784 okind = PyUnicode_KIND(repr);
10785 odata = PyUnicode_DATA(repr);
10786
10787 PyUnicode_WRITE(okind, odata, 0, quote);
10788 PyUnicode_WRITE(okind, odata, osize-1, quote);
10789
10790 for (i = 0, o = 1; i < isize; i++) {
10791 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010792
10793 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 if ((ch == quote) || (ch == '\\')) {
10795 PyUnicode_WRITE(okind, odata, o++, '\\');
10796 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010797 continue;
10798 }
10799
Benjamin Peterson29060642009-01-31 22:14:21 +000010800 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010801 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 PyUnicode_WRITE(okind, odata, o++, '\\');
10803 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010804 }
10805 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 PyUnicode_WRITE(okind, odata, o++, '\\');
10807 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010808 }
10809 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 PyUnicode_WRITE(okind, odata, o++, '\\');
10811 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010812 }
10813
10814 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010815 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 PyUnicode_WRITE(okind, odata, o++, '\\');
10817 PyUnicode_WRITE(okind, odata, o++, 'x');
10818 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10819 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010820 }
10821
Georg Brandl559e5d72008-06-11 18:37:52 +000010822 /* Copy ASCII characters as-is */
10823 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010825 }
10826
Benjamin Peterson29060642009-01-31 22:14:21 +000010827 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010828 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010829 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010830 (categories Z* and C* except ASCII space)
10831 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010833 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 if (ch <= 0xff) {
10835 PyUnicode_WRITE(okind, odata, o++, '\\');
10836 PyUnicode_WRITE(okind, odata, o++, 'x');
10837 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10838 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010839 }
10840 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 else if (ch >= 0x10000) {
10842 PyUnicode_WRITE(okind, odata, o++, '\\');
10843 PyUnicode_WRITE(okind, odata, o++, 'U');
10844 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10845 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10846 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10847 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10848 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10849 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10850 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10851 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010852 }
10853 /* Map 16-bit characters to '\uxxxx' */
10854 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 PyUnicode_WRITE(okind, odata, o++, '\\');
10856 PyUnicode_WRITE(okind, odata, o++, 'u');
10857 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10858 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10859 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10860 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010861 }
10862 }
10863 /* Copy characters as-is */
10864 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010866 }
10867 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010870 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871}
10872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010873PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010874 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875\n\
10876Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010877such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878arguments start and end are interpreted as in slice notation.\n\
10879\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010880Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010881
10882static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010884{
Jesus Ceaac451502011-04-20 17:09:23 +020010885 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010886 Py_ssize_t start;
10887 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010888 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889
Jesus Ceaac451502011-04-20 17:09:23 +020010890 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10891 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010892 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 if (PyUnicode_READY(self) == -1)
10895 return NULL;
10896 if (PyUnicode_READY(substring) == -1)
10897 return NULL;
10898
10899 result = any_find_slice(
10900 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10901 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010902 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903
10904 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 if (result == -2)
10907 return NULL;
10908
Christian Heimes217cfd12007-12-02 14:31:20 +000010909 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910}
10911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010912PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010913 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010915Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916
10917static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919{
Jesus Ceaac451502011-04-20 17:09:23 +020010920 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010921 Py_ssize_t start;
10922 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010923 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924
Jesus Ceaac451502011-04-20 17:09:23 +020010925 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10926 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010927 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 if (PyUnicode_READY(self) == -1)
10930 return NULL;
10931 if (PyUnicode_READY(substring) == -1)
10932 return NULL;
10933
10934 result = any_find_slice(
10935 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10936 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010937 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938
10939 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 if (result == -2)
10942 return NULL;
10943
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944 if (result < 0) {
10945 PyErr_SetString(PyExc_ValueError, "substring not found");
10946 return NULL;
10947 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948
Christian Heimes217cfd12007-12-02 14:31:20 +000010949 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950}
10951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010952PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010953 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010955Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010956done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957
10958static PyObject *
10959unicode_rjust(PyUnicodeObject *self, PyObject *args)
10960{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010961 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 Py_UCS4 fillchar = ' ';
10963
10964 if (PyUnicode_READY(self) == -1)
10965 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010966
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010967 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968 return NULL;
10969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971 Py_INCREF(self);
10972 return (PyObject*) self;
10973 }
10974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976}
10977
Alexander Belopolsky40018472011-02-26 01:02:56 +000010978PyObject *
10979PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980{
10981 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010982
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983 s = PyUnicode_FromObject(s);
10984 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010985 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010986 if (sep != NULL) {
10987 sep = PyUnicode_FromObject(sep);
10988 if (sep == NULL) {
10989 Py_DECREF(s);
10990 return NULL;
10991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 }
10993
10994 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10995
10996 Py_DECREF(s);
10997 Py_XDECREF(sep);
10998 return result;
10999}
11000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011001PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011002 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003\n\
11004Return a list of the words in S, using sep as the\n\
11005delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011006splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011007whitespace string is a separator and empty strings are\n\
11008removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009
11010static PyObject*
11011unicode_split(PyUnicodeObject *self, PyObject *args)
11012{
11013 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011014 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015
Martin v. Löwis18e16552006-02-15 17:27:45 +000011016 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 return NULL;
11018
11019 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011024 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025}
11026
Thomas Wouters477c8d52006-05-27 19:21:47 +000011027PyObject *
11028PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11029{
11030 PyObject* str_obj;
11031 PyObject* sep_obj;
11032 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 int kind1, kind2, kind;
11034 void *buf1 = NULL, *buf2 = NULL;
11035 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011036
11037 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011039 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011040 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011042 Py_DECREF(str_obj);
11043 return NULL;
11044 }
11045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046 kind1 = PyUnicode_KIND(str_in);
11047 kind2 = PyUnicode_KIND(sep_obj);
11048 kind = kind1 > kind2 ? kind1 : kind2;
11049 buf1 = PyUnicode_DATA(str_in);
11050 if (kind1 != kind)
11051 buf1 = _PyUnicode_AsKind(str_in, kind);
11052 if (!buf1)
11053 goto onError;
11054 buf2 = PyUnicode_DATA(sep_obj);
11055 if (kind2 != kind)
11056 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11057 if (!buf2)
11058 goto onError;
11059 len1 = PyUnicode_GET_LENGTH(str_obj);
11060 len2 = PyUnicode_GET_LENGTH(sep_obj);
11061
11062 switch(PyUnicode_KIND(str_in)) {
11063 case PyUnicode_1BYTE_KIND:
11064 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11065 break;
11066 case PyUnicode_2BYTE_KIND:
11067 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11068 break;
11069 case PyUnicode_4BYTE_KIND:
11070 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11071 break;
11072 default:
11073 assert(0);
11074 out = 0;
11075 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011076
11077 Py_DECREF(sep_obj);
11078 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011079 if (kind1 != kind)
11080 PyMem_Free(buf1);
11081 if (kind2 != kind)
11082 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011083
11084 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 onError:
11086 Py_DECREF(sep_obj);
11087 Py_DECREF(str_obj);
11088 if (kind1 != kind && buf1)
11089 PyMem_Free(buf1);
11090 if (kind2 != kind && buf2)
11091 PyMem_Free(buf2);
11092 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011093}
11094
11095
11096PyObject *
11097PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11098{
11099 PyObject* str_obj;
11100 PyObject* sep_obj;
11101 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102 int kind1, kind2, kind;
11103 void *buf1 = NULL, *buf2 = NULL;
11104 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011105
11106 str_obj = PyUnicode_FromObject(str_in);
11107 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011108 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011109 sep_obj = PyUnicode_FromObject(sep_in);
11110 if (!sep_obj) {
11111 Py_DECREF(str_obj);
11112 return NULL;
11113 }
11114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 kind1 = PyUnicode_KIND(str_in);
11116 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011117 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 buf1 = PyUnicode_DATA(str_in);
11119 if (kind1 != kind)
11120 buf1 = _PyUnicode_AsKind(str_in, kind);
11121 if (!buf1)
11122 goto onError;
11123 buf2 = PyUnicode_DATA(sep_obj);
11124 if (kind2 != kind)
11125 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11126 if (!buf2)
11127 goto onError;
11128 len1 = PyUnicode_GET_LENGTH(str_obj);
11129 len2 = PyUnicode_GET_LENGTH(sep_obj);
11130
11131 switch(PyUnicode_KIND(str_in)) {
11132 case PyUnicode_1BYTE_KIND:
11133 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11134 break;
11135 case PyUnicode_2BYTE_KIND:
11136 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11137 break;
11138 case PyUnicode_4BYTE_KIND:
11139 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11140 break;
11141 default:
11142 assert(0);
11143 out = 0;
11144 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011145
11146 Py_DECREF(sep_obj);
11147 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 if (kind1 != kind)
11149 PyMem_Free(buf1);
11150 if (kind2 != kind)
11151 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011152
11153 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 onError:
11155 Py_DECREF(sep_obj);
11156 Py_DECREF(str_obj);
11157 if (kind1 != kind && buf1)
11158 PyMem_Free(buf1);
11159 if (kind2 != kind && buf2)
11160 PyMem_Free(buf2);
11161 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011162}
11163
11164PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011165 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011167Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011168the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011169found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011170
11171static PyObject*
11172unicode_partition(PyUnicodeObject *self, PyObject *separator)
11173{
11174 return PyUnicode_Partition((PyObject *)self, separator);
11175}
11176
11177PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011178 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011179\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011180Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011181the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011182separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011183
11184static PyObject*
11185unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11186{
11187 return PyUnicode_RPartition((PyObject *)self, separator);
11188}
11189
Alexander Belopolsky40018472011-02-26 01:02:56 +000011190PyObject *
11191PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011192{
11193 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011194
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011195 s = PyUnicode_FromObject(s);
11196 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011197 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011198 if (sep != NULL) {
11199 sep = PyUnicode_FromObject(sep);
11200 if (sep == NULL) {
11201 Py_DECREF(s);
11202 return NULL;
11203 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011204 }
11205
11206 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11207
11208 Py_DECREF(s);
11209 Py_XDECREF(sep);
11210 return result;
11211}
11212
11213PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011214 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011215\n\
11216Return a list of the words in S, using sep as the\n\
11217delimiter string, starting at the end of the string and\n\
11218working to the front. If maxsplit is given, at most maxsplit\n\
11219splits are done. If sep is not specified, any whitespace string\n\
11220is a separator.");
11221
11222static PyObject*
11223unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11224{
11225 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011226 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011227
Martin v. Löwis18e16552006-02-15 17:27:45 +000011228 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011229 return NULL;
11230
11231 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011232 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011233 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011234 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011235 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011237}
11238
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011239PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241\n\
11242Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011243Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011244is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245
11246static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011247unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011249 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011250 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011252 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11253 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254 return NULL;
11255
Guido van Rossum86662912000-04-11 15:38:46 +000011256 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257}
11258
11259static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011260PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261{
Walter Dörwald346737f2007-05-31 10:44:43 +000011262 if (PyUnicode_CheckExact(self)) {
11263 Py_INCREF(self);
11264 return self;
11265 } else
11266 /* Subtype -- return genuine unicode string with the same value. */
11267 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
11268 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269}
11270
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011271PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011272 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273\n\
11274Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011275and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276
11277static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011278unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280 return fixup(self, fixswapcase);
11281}
11282
Georg Brandlceee0772007-11-27 23:48:05 +000011283PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011284 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011285\n\
11286Return a translation table usable for str.translate().\n\
11287If there is only one argument, it must be a dictionary mapping Unicode\n\
11288ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011289Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011290If there are two arguments, they must be strings of equal length, and\n\
11291in the resulting dictionary, each character in x will be mapped to the\n\
11292character at the same position in y. If there is a third argument, it\n\
11293must be a string, whose characters will be mapped to None in the result.");
11294
11295static PyObject*
11296unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11297{
11298 PyObject *x, *y = NULL, *z = NULL;
11299 PyObject *new = NULL, *key, *value;
11300 Py_ssize_t i = 0;
11301 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011302
Georg Brandlceee0772007-11-27 23:48:05 +000011303 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11304 return NULL;
11305 new = PyDict_New();
11306 if (!new)
11307 return NULL;
11308 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 int x_kind, y_kind, z_kind;
11310 void *x_data, *y_data, *z_data;
11311
Georg Brandlceee0772007-11-27 23:48:05 +000011312 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011313 if (!PyUnicode_Check(x)) {
11314 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11315 "be a string if there is a second argument");
11316 goto err;
11317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011319 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11320 "arguments must have equal length");
11321 goto err;
11322 }
11323 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 x_kind = PyUnicode_KIND(x);
11325 y_kind = PyUnicode_KIND(y);
11326 x_data = PyUnicode_DATA(x);
11327 y_data = PyUnicode_DATA(y);
11328 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11329 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11330 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011331 if (!key || !value)
11332 goto err;
11333 res = PyDict_SetItem(new, key, value);
11334 Py_DECREF(key);
11335 Py_DECREF(value);
11336 if (res < 0)
11337 goto err;
11338 }
11339 /* create entries for deleting chars in z */
11340 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 z_kind = PyUnicode_KIND(z);
11342 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011343 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011345 if (!key)
11346 goto err;
11347 res = PyDict_SetItem(new, key, Py_None);
11348 Py_DECREF(key);
11349 if (res < 0)
11350 goto err;
11351 }
11352 }
11353 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 int kind;
11355 void *data;
11356
Georg Brandlceee0772007-11-27 23:48:05 +000011357 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011358 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011359 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11360 "to maketrans it must be a dict");
11361 goto err;
11362 }
11363 /* copy entries into the new dict, converting string keys to int keys */
11364 while (PyDict_Next(x, &i, &key, &value)) {
11365 if (PyUnicode_Check(key)) {
11366 /* convert string keys to integer keys */
11367 PyObject *newkey;
11368 if (PyUnicode_GET_SIZE(key) != 1) {
11369 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11370 "table must be of length 1");
11371 goto err;
11372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 kind = PyUnicode_KIND(key);
11374 data = PyUnicode_DATA(key);
11375 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011376 if (!newkey)
11377 goto err;
11378 res = PyDict_SetItem(new, newkey, value);
11379 Py_DECREF(newkey);
11380 if (res < 0)
11381 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011382 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011383 /* just keep integer keys */
11384 if (PyDict_SetItem(new, key, value) < 0)
11385 goto err;
11386 } else {
11387 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11388 "be strings or integers");
11389 goto err;
11390 }
11391 }
11392 }
11393 return new;
11394 err:
11395 Py_DECREF(new);
11396 return NULL;
11397}
11398
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011399PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011400 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401\n\
11402Return a copy of the string S, where all characters have been mapped\n\
11403through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011404Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011405Unmapped characters are left untouched. Characters mapped to None\n\
11406are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407
11408static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412}
11413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011417Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
11419static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011420unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422 return fixup(self, fixupper);
11423}
11424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011425PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011426 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011428Pad a numeric string S with zeros on the left, to fill a field\n\
11429of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
11431static PyObject *
11432unicode_zfill(PyUnicodeObject *self, PyObject *args)
11433{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011434 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011436 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437 int kind;
11438 void *data;
11439 Py_UCS4 chr;
11440
11441 if (PyUnicode_READY(self) == -1)
11442 return NULL;
11443
Martin v. Löwis18e16552006-02-15 17:27:45 +000011444 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445 return NULL;
11446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011448 if (PyUnicode_CheckExact(self)) {
11449 Py_INCREF(self);
11450 return (PyObject*) self;
11451 }
11452 else
11453 return PyUnicode_FromUnicode(
11454 PyUnicode_AS_UNICODE(self),
11455 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457 }
11458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460
11461 u = pad(self, fill, 0, '0');
11462
Walter Dörwald068325e2002-04-15 13:36:47 +000011463 if (u == NULL)
11464 return NULL;
11465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 kind = PyUnicode_KIND(u);
11467 data = PyUnicode_DATA(u);
11468 chr = PyUnicode_READ(kind, data, fill);
11469
11470 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 PyUnicode_WRITE(kind, data, 0, chr);
11473 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474 }
11475
11476 return (PyObject*) u;
11477}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
11486
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011487PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011490Return True if S starts with the specified prefix, False otherwise.\n\
11491With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011492With optional end, stop comparing S at that position.\n\
11493prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494
11495static PyObject *
11496unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011497 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011499 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011501 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011502 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011503 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
Jesus Ceaac451502011-04-20 17:09:23 +020011505 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011507 if (PyTuple_Check(subobj)) {
11508 Py_ssize_t i;
11509 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11510 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011512 if (substring == NULL)
11513 return NULL;
11514 result = tailmatch(self, substring, start, end, -1);
11515 Py_DECREF(substring);
11516 if (result) {
11517 Py_RETURN_TRUE;
11518 }
11519 }
11520 /* nothing matched */
11521 Py_RETURN_FALSE;
11522 }
11523 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011524 if (substring == NULL) {
11525 if (PyErr_ExceptionMatches(PyExc_TypeError))
11526 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11527 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011528 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011529 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011530 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011532 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533}
11534
11535
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011536PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011537 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011539Return True if S ends with the specified suffix, False otherwise.\n\
11540With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011541With optional end, stop comparing S at that position.\n\
11542suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543
11544static PyObject *
11545unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011548 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011550 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011551 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011552 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553
Jesus Ceaac451502011-04-20 17:09:23 +020011554 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011556 if (PyTuple_Check(subobj)) {
11557 Py_ssize_t i;
11558 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11559 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011560 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011561 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011563 result = tailmatch(self, substring, start, end, +1);
11564 Py_DECREF(substring);
11565 if (result) {
11566 Py_RETURN_TRUE;
11567 }
11568 }
11569 Py_RETURN_FALSE;
11570 }
11571 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011572 if (substring == NULL) {
11573 if (PyErr_ExceptionMatches(PyExc_TypeError))
11574 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11575 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011576 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011577 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011578 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011580 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581}
11582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011584
11585PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011587\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011588Return a formatted version of S, using substitutions from args and kwargs.\n\
11589The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011590
Eric Smith27bbca62010-11-04 17:06:58 +000011591PyDoc_STRVAR(format_map__doc__,
11592 "S.format_map(mapping) -> str\n\
11593\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011594Return a formatted version of S, using substitutions from mapping.\n\
11595The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011596
Eric Smith4a7d76d2008-05-30 18:10:19 +000011597static PyObject *
11598unicode__format__(PyObject* self, PyObject* args)
11599{
11600 PyObject *format_spec;
11601
11602 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11603 return NULL;
11604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11606 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011607}
11608
Eric Smith8c663262007-08-25 02:26:07 +000011609PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011610 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011611\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011612Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011613
11614static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011615unicode__sizeof__(PyUnicodeObject *v)
11616{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 Py_ssize_t size;
11618
11619 /* If it's a compact object, account for base structure +
11620 character data. */
11621 if (PyUnicode_IS_COMPACT_ASCII(v))
11622 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11623 else if (PyUnicode_IS_COMPACT(v))
11624 size = sizeof(PyCompactUnicodeObject) +
11625 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11626 else {
11627 /* If it is a two-block object, account for base object, and
11628 for character block if present. */
11629 size = sizeof(PyUnicodeObject);
11630 if (v->data.any)
11631 size += (PyUnicode_GET_LENGTH(v) + 1) *
11632 PyUnicode_CHARACTER_SIZE(v);
11633 }
11634 /* If the wstr pointer is present, account for it unless it is shared
11635 with the data pointer. Since PyUnicode_DATA will crash if the object
11636 is not ready, check whether it's either not ready (in which case the
11637 data is entirely in wstr) or if the data is not shared. */
11638 if (_PyUnicode_WSTR(v) &&
11639 (!PyUnicode_IS_READY(v) ||
11640 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11641 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11642 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11643 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11644
11645 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011646}
11647
11648PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011650
11651static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011652unicode_getnewargs(PyUnicodeObject *v)
11653{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 PyObject *copy;
11655 unsigned char *data;
11656 int kind;
11657 if (PyUnicode_READY(v) == -1)
11658 return NULL;
11659 kind = PyUnicode_KIND(v);
11660 data = PyUnicode_1BYTE_DATA(v);
11661 copy = PyUnicode_FromKindAndData(kind, data, PyUnicode_GET_LENGTH(v));
11662 if (!copy)
11663 return NULL;
11664 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011665}
11666
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667static PyMethodDef unicode_methods[] = {
11668
11669 /* Order is according to common usage: often used methods should
11670 appear first, since lookup is done sequentially. */
11671
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011672 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011673 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11674 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011675 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011676 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11677 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11678 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11679 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11680 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11681 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11682 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011683 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011684 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11685 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11686 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011687 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011688 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11689 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11690 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011691 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011692 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011693 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011694 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011695 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11696 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11697 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11698 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11699 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11700 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11701 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11702 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11703 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11704 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11705 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11706 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11707 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11708 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011709 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011710 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011711 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011712 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011713 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011714 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011715 {"maketrans", (PyCFunction) unicode_maketrans,
11716 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011717 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011718#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011719 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720#endif
11721
11722#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011723 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011724 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725#endif
11726
Benjamin Peterson14339b62009-01-31 16:36:08 +000011727 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728 {NULL, NULL}
11729};
11730
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011731static PyObject *
11732unicode_mod(PyObject *v, PyObject *w)
11733{
Brian Curtindfc80e32011-08-10 20:28:54 -050011734 if (!PyUnicode_Check(v))
11735 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011736 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011737}
11738
11739static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011740 0, /*nb_add*/
11741 0, /*nb_subtract*/
11742 0, /*nb_multiply*/
11743 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011744};
11745
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011747 (lenfunc) unicode_length, /* sq_length */
11748 PyUnicode_Concat, /* sq_concat */
11749 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11750 (ssizeargfunc) unicode_getitem, /* sq_item */
11751 0, /* sq_slice */
11752 0, /* sq_ass_item */
11753 0, /* sq_ass_slice */
11754 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755};
11756
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011757static PyObject*
11758unicode_subscript(PyUnicodeObject* self, PyObject* item)
11759{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 if (PyUnicode_READY(self) == -1)
11761 return NULL;
11762
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011763 if (PyIndex_Check(item)) {
11764 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011765 if (i == -1 && PyErr_Occurred())
11766 return NULL;
11767 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011769 return unicode_getitem(self, i);
11770 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011771 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011773 Py_UNICODE* result_buf;
11774 PyObject* result;
11775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011777 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011778 return NULL;
11779 }
11780
11781 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 return PyUnicode_New(0, 0);
11783 } else if (start == 0 && step == 1 &&
11784 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011785 PyUnicode_CheckExact(self)) {
11786 Py_INCREF(self);
11787 return (PyObject *)self;
11788 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011790 } else {
11791 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011792 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11793 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011794
Benjamin Peterson29060642009-01-31 22:14:21 +000011795 if (result_buf == NULL)
11796 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011797
11798 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11799 result_buf[i] = source_buf[cur];
11800 }
Tim Petersced69f82003-09-16 20:30:58 +000011801
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011802 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011803 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011804 return result;
11805 }
11806 } else {
11807 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11808 return NULL;
11809 }
11810}
11811
11812static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011813 (lenfunc)unicode_length, /* mp_length */
11814 (binaryfunc)unicode_subscript, /* mp_subscript */
11815 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011816};
11817
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819/* Helpers for PyUnicode_Format() */
11820
11821static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011822getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011824 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011826 (*p_argidx)++;
11827 if (arglen < 0)
11828 return args;
11829 else
11830 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831 }
11832 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834 return NULL;
11835}
11836
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011837/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011839static PyObject *
11840formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011842 char *p;
11843 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011845
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846 x = PyFloat_AsDouble(v);
11847 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011848 return NULL;
11849
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011851 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011852
Eric Smith0923d1d2009-04-16 20:16:10 +000011853 p = PyOS_double_to_string(x, type, prec,
11854 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011855 if (p == NULL)
11856 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011858 PyMem_Free(p);
11859 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860}
11861
Tim Peters38fd5b62000-09-21 05:43:11 +000011862static PyObject*
11863formatlong(PyObject *val, int flags, int prec, int type)
11864{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011865 char *buf;
11866 int len;
11867 PyObject *str; /* temporary string object. */
11868 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011869
Benjamin Peterson14339b62009-01-31 16:36:08 +000011870 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11871 if (!str)
11872 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011874 Py_DECREF(str);
11875 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011876}
11877
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011880 size_t buflen,
11881 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011883 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011884 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 if (PyUnicode_GET_LENGTH(v) == 1) {
11886 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011887 buf[1] = '\0';
11888 return 1;
11889 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011890 goto onError;
11891 }
11892 else {
11893 /* Integer input truncated to a character */
11894 long x;
11895 x = PyLong_AsLong(v);
11896 if (x == -1 && PyErr_Occurred())
11897 goto onError;
11898
11899 if (x < 0 || x > 0x10ffff) {
11900 PyErr_SetString(PyExc_OverflowError,
11901 "%c arg not in range(0x110000)");
11902 return -1;
11903 }
11904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011906 buf[1] = '\0';
11907 return 1;
11908 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011909
Benjamin Peterson29060642009-01-31 22:14:21 +000011910 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011911 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011912 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011913 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914}
11915
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...).
   FORMATBUFLEN is the length (in elements) of the buffer in which
   single characters are formatted.  The expansion is fully parenthesized
   so the macro behaves as one operand in any expression context.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011920
Alexander Belopolsky40018472011-02-26 01:02:56 +000011921PyObject *
11922PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011924 void *fmt;
11925 int fmtkind;
11926 PyObject *result;
11927 Py_UCS4 *res, *res0;
11928 Py_UCS4 max;
11929 int kind;
11930 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011934
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 PyErr_BadInternalCall();
11937 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11940 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 fmt = PyUnicode_DATA(uformat);
11943 fmtkind = PyUnicode_KIND(uformat);
11944 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11945 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946
11947 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11949 if (res0 == NULL) {
11950 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011951 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953
11954 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 arglen = PyTuple_Size(args);
11956 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957 }
11958 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 arglen = -1;
11960 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011962 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011963 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965
11966 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011968 if (--rescnt < 0) {
11969 rescnt = fmtcnt + 100;
11970 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11972 if (res0 == NULL){
11973 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011974 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 }
11976 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011978 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011980 }
11981 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011982 /* Got a format specifier */
11983 int flags = 0;
11984 Py_ssize_t width = -1;
11985 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 Py_UCS4 c = '\0';
11987 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011988 int isnumok;
11989 PyObject *v = NULL;
11990 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 void *pbuf;
11992 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011993 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 Py_ssize_t len, len1;
11995 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 fmtpos++;
11998 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
11999 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 Py_ssize_t keylen;
12001 PyObject *key;
12002 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012003
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 if (dict == NULL) {
12005 PyErr_SetString(PyExc_TypeError,
12006 "format requires a mapping");
12007 goto onError;
12008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012010 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012012 /* Skip over balanced parentheses */
12013 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012015 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012017 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012019 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 if (fmtcnt < 0 || pcount > 0) {
12022 PyErr_SetString(PyExc_ValueError,
12023 "incomplete format key");
12024 goto onError;
12025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012027 if (key == NULL)
12028 goto onError;
12029 if (args_owned) {
12030 Py_DECREF(args);
12031 args_owned = 0;
12032 }
12033 args = PyObject_GetItem(dict, key);
12034 Py_DECREF(key);
12035 if (args == NULL) {
12036 goto onError;
12037 }
12038 args_owned = 1;
12039 arglen = -1;
12040 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012041 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012042 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 case '-': flags |= F_LJUST; continue;
12045 case '+': flags |= F_SIGN; continue;
12046 case ' ': flags |= F_BLANK; continue;
12047 case '#': flags |= F_ALT; continue;
12048 case '0': flags |= F_ZERO; continue;
12049 }
12050 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012051 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012052 if (c == '*') {
12053 v = getnextarg(args, arglen, &argidx);
12054 if (v == NULL)
12055 goto onError;
12056 if (!PyLong_Check(v)) {
12057 PyErr_SetString(PyExc_TypeError,
12058 "* wants int");
12059 goto onError;
12060 }
12061 width = PyLong_AsLong(v);
12062 if (width == -1 && PyErr_Occurred())
12063 goto onError;
12064 if (width < 0) {
12065 flags |= F_LJUST;
12066 width = -width;
12067 }
12068 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012070 }
12071 else if (c >= '0' && c <= '9') {
12072 width = c - '0';
12073 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012075 if (c < '0' || c > '9')
12076 break;
12077 if ((width*10) / 10 != width) {
12078 PyErr_SetString(PyExc_ValueError,
12079 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012080 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012081 }
12082 width = width*10 + (c - '0');
12083 }
12084 }
12085 if (c == '.') {
12086 prec = 0;
12087 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012089 if (c == '*') {
12090 v = getnextarg(args, arglen, &argidx);
12091 if (v == NULL)
12092 goto onError;
12093 if (!PyLong_Check(v)) {
12094 PyErr_SetString(PyExc_TypeError,
12095 "* wants int");
12096 goto onError;
12097 }
12098 prec = PyLong_AsLong(v);
12099 if (prec == -1 && PyErr_Occurred())
12100 goto onError;
12101 if (prec < 0)
12102 prec = 0;
12103 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012105 }
12106 else if (c >= '0' && c <= '9') {
12107 prec = c - '0';
12108 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012110 if (c < '0' || c > '9')
12111 break;
12112 if ((prec*10) / 10 != prec) {
12113 PyErr_SetString(PyExc_ValueError,
12114 "prec too big");
12115 goto onError;
12116 }
12117 prec = prec*10 + (c - '0');
12118 }
12119 }
12120 } /* prec */
12121 if (fmtcnt >= 0) {
12122 if (c == 'h' || c == 'l' || c == 'L') {
12123 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012125 }
12126 }
12127 if (fmtcnt < 0) {
12128 PyErr_SetString(PyExc_ValueError,
12129 "incomplete format");
12130 goto onError;
12131 }
12132 if (c != '%') {
12133 v = getnextarg(args, arglen, &argidx);
12134 if (v == NULL)
12135 goto onError;
12136 }
12137 sign = 0;
12138 fill = ' ';
12139 switch (c) {
12140
12141 case '%':
12142 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012144 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012146 len = 1;
12147 break;
12148
12149 case 's':
12150 case 'r':
12151 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012152 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012153 temp = v;
12154 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012155 }
12156 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012157 if (c == 's')
12158 temp = PyObject_Str(v);
12159 else if (c == 'r')
12160 temp = PyObject_Repr(v);
12161 else
12162 temp = PyObject_ASCII(v);
12163 if (temp == NULL)
12164 goto onError;
12165 if (PyUnicode_Check(temp))
12166 /* nothing to do */;
12167 else {
12168 Py_DECREF(temp);
12169 PyErr_SetString(PyExc_TypeError,
12170 "%s argument has non-string str()");
12171 goto onError;
12172 }
12173 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 if (PyUnicode_READY(temp) == -1) {
12175 Py_CLEAR(temp);
12176 goto onError;
12177 }
12178 pbuf = PyUnicode_DATA(temp);
12179 kind = PyUnicode_KIND(temp);
12180 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012181 if (prec >= 0 && len > prec)
12182 len = prec;
12183 break;
12184
12185 case 'i':
12186 case 'd':
12187 case 'u':
12188 case 'o':
12189 case 'x':
12190 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012191 isnumok = 0;
12192 if (PyNumber_Check(v)) {
12193 PyObject *iobj=NULL;
12194
12195 if (PyLong_Check(v)) {
12196 iobj = v;
12197 Py_INCREF(iobj);
12198 }
12199 else {
12200 iobj = PyNumber_Long(v);
12201 }
12202 if (iobj!=NULL) {
12203 if (PyLong_Check(iobj)) {
12204 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012205 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012206 Py_DECREF(iobj);
12207 if (!temp)
12208 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 if (PyUnicode_READY(temp) == -1) {
12210 Py_CLEAR(temp);
12211 goto onError;
12212 }
12213 pbuf = PyUnicode_DATA(temp);
12214 kind = PyUnicode_KIND(temp);
12215 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012216 sign = 1;
12217 }
12218 else {
12219 Py_DECREF(iobj);
12220 }
12221 }
12222 }
12223 if (!isnumok) {
12224 PyErr_Format(PyExc_TypeError,
12225 "%%%c format: a number is required, "
12226 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12227 goto onError;
12228 }
12229 if (flags & F_ZERO)
12230 fill = '0';
12231 break;
12232
12233 case 'e':
12234 case 'E':
12235 case 'f':
12236 case 'F':
12237 case 'g':
12238 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012239 temp = formatfloat(v, flags, prec, c);
12240 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012241 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 if (PyUnicode_READY(temp) == -1) {
12243 Py_CLEAR(temp);
12244 goto onError;
12245 }
12246 pbuf = PyUnicode_DATA(temp);
12247 kind = PyUnicode_KIND(temp);
12248 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012249 sign = 1;
12250 if (flags & F_ZERO)
12251 fill = '0';
12252 break;
12253
12254 case 'c':
12255 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012257 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
12258 if (len < 0)
12259 goto onError;
12260 break;
12261
12262 default:
12263 PyErr_Format(PyExc_ValueError,
12264 "unsupported format character '%c' (0x%x) "
12265 "at index %zd",
12266 (31<=c && c<=126) ? (char)c : '?',
12267 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012269 goto onError;
12270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 /* pbuf is initialized here. */
12272 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012273 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12275 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12276 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012277 len--;
12278 }
12279 else if (flags & F_SIGN)
12280 sign = '+';
12281 else if (flags & F_BLANK)
12282 sign = ' ';
12283 else
12284 sign = 0;
12285 }
12286 if (width < len)
12287 width = len;
12288 if (rescnt - (sign != 0) < width) {
12289 reslen -= rescnt;
12290 rescnt = width + fmtcnt + 100;
12291 reslen += rescnt;
12292 if (reslen < 0) {
12293 Py_XDECREF(temp);
12294 PyErr_NoMemory();
12295 goto onError;
12296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12298 if (res0 == 0) {
12299 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 Py_XDECREF(temp);
12301 goto onError;
12302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 }
12305 if (sign) {
12306 if (fill != ' ')
12307 *res++ = sign;
12308 rescnt--;
12309 if (width > len)
12310 width--;
12311 }
12312 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12314 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012315 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12317 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012318 }
12319 rescnt -= 2;
12320 width -= 2;
12321 if (width < 0)
12322 width = 0;
12323 len -= 2;
12324 }
12325 if (width > len && !(flags & F_LJUST)) {
12326 do {
12327 --rescnt;
12328 *res++ = fill;
12329 } while (--width > len);
12330 }
12331 if (fill == ' ') {
12332 if (sign)
12333 *res++ = sign;
12334 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12336 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12337 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12338 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012339 }
12340 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 /* Copy all characters, preserving len */
12342 len1 = len;
12343 while (len1--) {
12344 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12345 rescnt--;
12346 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012347 while (--width >= len) {
12348 --rescnt;
12349 *res++ = ' ';
12350 }
12351 if (dict && (argidx < arglen) && c != '%') {
12352 PyErr_SetString(PyExc_TypeError,
12353 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012354 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012355 goto onError;
12356 }
12357 Py_XDECREF(temp);
12358 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359 } /* until end */
12360 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012361 PyErr_SetString(PyExc_TypeError,
12362 "not all arguments converted during string formatting");
12363 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364 }
12365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366
12367 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12368 if (*res > max)
12369 max = *res;
12370 result = PyUnicode_New(reslen - rescnt, max);
12371 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012372 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 kind = PyUnicode_KIND(result);
12374 for (res = res0; res < res0+reslen-rescnt; res++)
12375 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12376 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012378 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379 }
12380 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381 return (PyObject *)result;
12382
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385 Py_DECREF(uformat);
12386 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012387 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388 }
12389 return NULL;
12390}
12391
Jeremy Hylton938ace62002-07-17 16:30:39 +000012392static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012393unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12394
Tim Peters6d6c1a32001-08-02 04:15:00 +000012395static PyObject *
12396unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12397{
Benjamin Peterson29060642009-01-31 22:14:21 +000012398 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012399 static char *kwlist[] = {"object", "encoding", "errors", 0};
12400 char *encoding = NULL;
12401 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012402
Benjamin Peterson14339b62009-01-31 16:36:08 +000012403 if (type != &PyUnicode_Type)
12404 return unicode_subtype_new(type, args, kwds);
12405 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012406 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012407 return NULL;
12408 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012410 if (encoding == NULL && errors == NULL)
12411 return PyObject_Str(x);
12412 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012413 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012414}
12415
Guido van Rossume023fe02001-08-30 03:12:59 +000012416static PyObject *
12417unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12418{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012419 PyUnicodeObject *tmp, *pnew;
12420 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012422
Benjamin Peterson14339b62009-01-31 16:36:08 +000012423 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12424 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12425 if (tmp == NULL)
12426 return NULL;
12427 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12429 // it seems kind of strange that tp_alloc gets passed the size
12430 // of the unicode string because there will follow another
12431 // malloc.
12432 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12433 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012434 if (pnew == NULL) {
12435 Py_DECREF(tmp);
12436 return NULL;
12437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12439 if (_PyUnicode_WSTR(pnew) == NULL) {
12440 err = PyErr_NoMemory();
12441 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12444 _PyUnicode_WSTR_LENGTH(pnew) = n;
12445 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12446 _PyUnicode_STATE(pnew).interned = 0;
12447 _PyUnicode_STATE(pnew).kind = 0;
12448 _PyUnicode_STATE(pnew).compact = 0;
12449 _PyUnicode_STATE(pnew).ready = 0;
12450 _PyUnicode_STATE(pnew).ascii = 0;
12451 pnew->data.any = NULL;
12452 _PyUnicode_LENGTH(pnew) = 0;
12453 pnew->_base.utf8 = NULL;
12454 pnew->_base.utf8_length = 0;
12455
12456 if (PyUnicode_READY(pnew) == -1) {
12457 PyObject_FREE(_PyUnicode_WSTR(pnew));
12458 goto onError;
12459 }
12460
Benjamin Peterson14339b62009-01-31 16:36:08 +000012461 Py_DECREF(tmp);
12462 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463
12464 onError:
12465 _Py_ForgetReference((PyObject *)pnew);
12466 PyObject_Del(pnew);
12467 Py_DECREF(tmp);
12468 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012469}
12470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012471PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012472 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012473\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012474Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012475encoding defaults to the current default string encoding.\n\
12476errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012477
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012478static PyObject *unicode_iter(PyObject *seq);
12479
Guido van Rossumd57fd912000-03-10 22:53:23 +000012480PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012481 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012482 "str", /* tp_name */
12483 sizeof(PyUnicodeObject), /* tp_size */
12484 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012486 (destructor)unicode_dealloc, /* tp_dealloc */
12487 0, /* tp_print */
12488 0, /* tp_getattr */
12489 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012490 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012491 unicode_repr, /* tp_repr */
12492 &unicode_as_number, /* tp_as_number */
12493 &unicode_as_sequence, /* tp_as_sequence */
12494 &unicode_as_mapping, /* tp_as_mapping */
12495 (hashfunc) unicode_hash, /* tp_hash*/
12496 0, /* tp_call*/
12497 (reprfunc) unicode_str, /* tp_str */
12498 PyObject_GenericGetAttr, /* tp_getattro */
12499 0, /* tp_setattro */
12500 0, /* tp_as_buffer */
12501 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012502 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012503 unicode_doc, /* tp_doc */
12504 0, /* tp_traverse */
12505 0, /* tp_clear */
12506 PyUnicode_RichCompare, /* tp_richcompare */
12507 0, /* tp_weaklistoffset */
12508 unicode_iter, /* tp_iter */
12509 0, /* tp_iternext */
12510 unicode_methods, /* tp_methods */
12511 0, /* tp_members */
12512 0, /* tp_getset */
12513 &PyBaseObject_Type, /* tp_base */
12514 0, /* tp_dict */
12515 0, /* tp_descr_get */
12516 0, /* tp_descr_set */
12517 0, /* tp_dictoffset */
12518 0, /* tp_init */
12519 0, /* tp_alloc */
12520 unicode_new, /* tp_new */
12521 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522};
12523
12524/* Initialize the Unicode implementation */
12525
Thomas Wouters78890102000-07-22 19:25:51 +000012526void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012528 int i;
12529
Thomas Wouters477c8d52006-05-27 19:21:47 +000012530 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012532 0x000A, /* LINE FEED */
12533 0x000D, /* CARRIAGE RETURN */
12534 0x001C, /* FILE SEPARATOR */
12535 0x001D, /* GROUP SEPARATOR */
12536 0x001E, /* RECORD SEPARATOR */
12537 0x0085, /* NEXT LINE */
12538 0x2028, /* LINE SEPARATOR */
12539 0x2029, /* PARAGRAPH SEPARATOR */
12540 };
12541
Fred Drakee4315f52000-05-09 19:53:39 +000012542 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012544 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012545 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012546
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012547 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012548 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012549 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012550 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012551
12552 /* initialize the linebreak bloom filter */
12553 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 PyUnicode_2BYTE_KIND, linebreak,
12555 sizeof(linebreak) / sizeof(linebreak[0]));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012556
12557 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558}
12559
12560/* Finalize the Unicode implementation */
12561
/* Nothing is released here; the function unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
12567
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568void
Thomas Wouters78890102000-07-22 19:25:51 +000012569_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012571 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012573 Py_XDECREF(unicode_empty);
12574 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012575
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012576 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012577 if (unicode_latin1[i]) {
12578 Py_DECREF(unicode_latin1[i]);
12579 unicode_latin1[i] = NULL;
12580 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012581 }
Christian Heimesa156e092008-02-16 07:38:31 +000012582 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012584
Walter Dörwald16807132007-05-25 13:52:07 +000012585void
12586PyUnicode_InternInPlace(PyObject **p)
12587{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012588 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12589 PyObject *t;
12590 if (s == NULL || !PyUnicode_Check(s))
12591 Py_FatalError(
12592 "PyUnicode_InternInPlace: unicode strings only please!");
12593 /* If it's a subclass, we don't really know what putting
12594 it in the interned dict might do. */
12595 if (!PyUnicode_CheckExact(s))
12596 return;
12597 if (PyUnicode_CHECK_INTERNED(s))
12598 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599 if (PyUnicode_READY(s) == -1) {
12600 assert(0 && "ready fail in intern...");
12601 return;
12602 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012603 if (interned == NULL) {
12604 interned = PyDict_New();
12605 if (interned == NULL) {
12606 PyErr_Clear(); /* Don't leave an exception */
12607 return;
12608 }
12609 }
12610 /* It might be that the GetItem call fails even
12611 though the key is present in the dictionary,
12612 namely when this happens during a stack overflow. */
12613 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012614 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012615 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012616
Benjamin Peterson29060642009-01-31 22:14:21 +000012617 if (t) {
12618 Py_INCREF(t);
12619 Py_DECREF(*p);
12620 *p = t;
12621 return;
12622 }
Walter Dörwald16807132007-05-25 13:52:07 +000012623
Benjamin Peterson14339b62009-01-31 16:36:08 +000012624 PyThreadState_GET()->recursion_critical = 1;
12625 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12626 PyErr_Clear();
12627 PyThreadState_GET()->recursion_critical = 0;
12628 return;
12629 }
12630 PyThreadState_GET()->recursion_critical = 0;
12631 /* The two references in interned are not counted by refcnt.
12632 The deallocator will take care of this */
12633 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012635}
12636
12637void
12638PyUnicode_InternImmortal(PyObject **p)
12639{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12641
Benjamin Peterson14339b62009-01-31 16:36:08 +000012642 PyUnicode_InternInPlace(p);
12643 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012645 Py_INCREF(*p);
12646 }
Walter Dörwald16807132007-05-25 13:52:07 +000012647}
12648
12649PyObject *
12650PyUnicode_InternFromString(const char *cp)
12651{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012652 PyObject *s = PyUnicode_FromString(cp);
12653 if (s == NULL)
12654 return NULL;
12655 PyUnicode_InternInPlace(&s);
12656 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012657}
12658
Alexander Belopolsky40018472011-02-26 01:02:56 +000012659void
12660_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012661{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012662 PyObject *keys;
12663 PyUnicodeObject *s;
12664 Py_ssize_t i, n;
12665 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012666
Benjamin Peterson14339b62009-01-31 16:36:08 +000012667 if (interned == NULL || !PyDict_Check(interned))
12668 return;
12669 keys = PyDict_Keys(interned);
12670 if (keys == NULL || !PyList_Check(keys)) {
12671 PyErr_Clear();
12672 return;
12673 }
Walter Dörwald16807132007-05-25 13:52:07 +000012674
Benjamin Peterson14339b62009-01-31 16:36:08 +000012675 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12676 detector, interned unicode strings are not forcibly deallocated;
12677 rather, we give them their stolen references back, and then clear
12678 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012679
Benjamin Peterson14339b62009-01-31 16:36:08 +000012680 n = PyList_GET_SIZE(keys);
12681 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012682 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012683 for (i = 0; i < n; i++) {
12684 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 if (PyUnicode_READY(s) == -1)
12686 fprintf(stderr, "could not ready string\n");
12687 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012688 case SSTATE_NOT_INTERNED:
12689 /* XXX Shouldn't happen */
12690 break;
12691 case SSTATE_INTERNED_IMMORTAL:
12692 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012694 break;
12695 case SSTATE_INTERNED_MORTAL:
12696 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012698 break;
12699 default:
12700 Py_FatalError("Inconsistent interned string state.");
12701 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012702 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012703 }
12704 fprintf(stderr, "total size of all interned strings: "
12705 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12706 "mortal/immortal\n", mortal_size, immortal_size);
12707 Py_DECREF(keys);
12708 PyDict_Clear(interned);
12709 Py_DECREF(interned);
12710 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012711}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012712
12713
12714/********************* Unicode Iterator **************************/
12715
12716typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012717 PyObject_HEAD
12718 Py_ssize_t it_index;
12719 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012720} unicodeiterobject;
12721
12722static void
12723unicodeiter_dealloc(unicodeiterobject *it)
12724{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012725 _PyObject_GC_UNTRACK(it);
12726 Py_XDECREF(it->it_seq);
12727 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012728}
12729
12730static int
12731unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12732{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012733 Py_VISIT(it->it_seq);
12734 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012735}
12736
12737static PyObject *
12738unicodeiter_next(unicodeiterobject *it)
12739{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012740 PyUnicodeObject *seq;
12741 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012742
Benjamin Peterson14339b62009-01-31 16:36:08 +000012743 assert(it != NULL);
12744 seq = it->it_seq;
12745 if (seq == NULL)
12746 return NULL;
12747 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12750 int kind = PyUnicode_KIND(seq);
12751 void *data = PyUnicode_DATA(seq);
12752 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12753 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012754 if (item != NULL)
12755 ++it->it_index;
12756 return item;
12757 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012758
Benjamin Peterson14339b62009-01-31 16:36:08 +000012759 Py_DECREF(seq);
12760 it->it_seq = NULL;
12761 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012762}
12763
12764static PyObject *
12765unicodeiter_len(unicodeiterobject *it)
12766{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012767 Py_ssize_t len = 0;
12768 if (it->it_seq)
12769 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12770 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012771}
12772
12773PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12774
12775static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012776 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012777 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012778 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012779};
12780
12781PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012782 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12783 "str_iterator", /* tp_name */
12784 sizeof(unicodeiterobject), /* tp_basicsize */
12785 0, /* tp_itemsize */
12786 /* methods */
12787 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12788 0, /* tp_print */
12789 0, /* tp_getattr */
12790 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012791 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012792 0, /* tp_repr */
12793 0, /* tp_as_number */
12794 0, /* tp_as_sequence */
12795 0, /* tp_as_mapping */
12796 0, /* tp_hash */
12797 0, /* tp_call */
12798 0, /* tp_str */
12799 PyObject_GenericGetAttr, /* tp_getattro */
12800 0, /* tp_setattro */
12801 0, /* tp_as_buffer */
12802 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12803 0, /* tp_doc */
12804 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12805 0, /* tp_clear */
12806 0, /* tp_richcompare */
12807 0, /* tp_weaklistoffset */
12808 PyObject_SelfIter, /* tp_iter */
12809 (iternextfunc)unicodeiter_next, /* tp_iternext */
12810 unicodeiter_methods, /* tp_methods */
12811 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012812};
12813
12814static PyObject *
12815unicode_iter(PyObject *seq)
12816{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012817 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012818
Benjamin Peterson14339b62009-01-31 16:36:08 +000012819 if (!PyUnicode_Check(seq)) {
12820 PyErr_BadInternalCall();
12821 return NULL;
12822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823 if (PyUnicode_READY(seq) == -1)
12824 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012825 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12826 if (it == NULL)
12827 return NULL;
12828 it->it_index = 0;
12829 Py_INCREF(seq);
12830 it->it_seq = (PyUnicodeObject *)seq;
12831 _PyObject_GC_TRACK(it);
12832 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012833}
12834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835#define UNIOP(x) Py_UNICODE_##x
12836#define UNIOP_t Py_UNICODE
12837#include "uniops.h"
12838#undef UNIOP
12839#undef UNIOP_t
12840#define UNIOP(x) Py_UCS4_##x
12841#define UNIOP_t Py_UCS4
12842#include "uniops.h"
12843#undef UNIOP
12844#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012845
Victor Stinner71133ff2010-09-01 23:43:53 +000012846Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012847PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012848{
12849 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12850 Py_UNICODE *copy;
12851 Py_ssize_t size;
12852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012853 if (!PyUnicode_Check(unicode)) {
12854 PyErr_BadArgument();
12855 return NULL;
12856 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012857 /* Ensure we won't overflow the size. */
12858 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12859 PyErr_NoMemory();
12860 return NULL;
12861 }
12862 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12863 size *= sizeof(Py_UNICODE);
12864 copy = PyMem_Malloc(size);
12865 if (copy == NULL) {
12866 PyErr_NoMemory();
12867 return NULL;
12868 }
12869 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12870 return copy;
12871}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012872
Georg Brandl66c221e2010-10-14 07:04:07 +000012873/* A _string module, to export formatter_parser and formatter_field_name_split
12874 to the string.Formatter class implemented in Python. */
12875
12876static PyMethodDef _string_methods[] = {
12877 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12878 METH_O, PyDoc_STR("split the argument as a field name")},
12879 {"formatter_parser", (PyCFunction) formatter_parser,
12880 METH_O, PyDoc_STR("parse the argument as a format string")},
12881 {NULL, NULL}
12882};
12883
12884static struct PyModuleDef _string_module = {
12885 PyModuleDef_HEAD_INIT,
12886 "_string",
12887 PyDoc_STR("string helper module"),
12888 0,
12889 _string_methods,
12890 NULL,
12891 NULL,
12892 NULL,
12893 NULL
12894};
12895
12896PyMODINIT_FUNC
12897PyInit__string(void)
12898{
12899 return PyModule_Create(&_string_module);
12900}
12901
12902
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012903#ifdef __cplusplus
12904}
12905#endif