blob: 0f5bdfcb3666f0332a0c074b7e16b948a099d99d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
52 The globals are initialized by the _PyUnicode_Init() API and should
53 not be used before calling that API.
54
55*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000057
58#ifdef __cplusplus
59extern "C" {
60#endif
61
Victor Stinner8faf8212011-12-08 22:14:11 +010062/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
63#define MAX_UNICODE 0x10ffff
64
Victor Stinner910337b2011-10-03 03:20:16 +020065#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020066# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020067#else
68# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
69#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020070
Victor Stinnere90fe6a2011-10-01 16:48:13 +020071#define _PyUnicode_UTF8(op) \
72 (((PyCompactUnicodeObject*)(op))->utf8)
73#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020074 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020075 assert(PyUnicode_IS_READY(op)), \
76 PyUnicode_IS_COMPACT_ASCII(op) ? \
77 ((char*)((PyASCIIObject*)(op) + 1)) : \
78 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +020079#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020080 (((PyCompactUnicodeObject*)(op))->utf8_length)
81#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020082 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020083 assert(PyUnicode_IS_READY(op)), \
84 PyUnicode_IS_COMPACT_ASCII(op) ? \
85 ((PyASCIIObject*)(op))->length : \
86 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +020087#define _PyUnicode_WSTR(op) \
88 (((PyASCIIObject*)(op))->wstr)
89#define _PyUnicode_WSTR_LENGTH(op) \
90 (((PyCompactUnicodeObject*)(op))->wstr_length)
91#define _PyUnicode_LENGTH(op) \
92 (((PyASCIIObject *)(op))->length)
93#define _PyUnicode_STATE(op) \
94 (((PyASCIIObject *)(op))->state)
95#define _PyUnicode_HASH(op) \
96 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +020097#define _PyUnicode_KIND(op) \
98 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020099 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200100#define _PyUnicode_GET_LENGTH(op) \
101 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200102 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200103#define _PyUnicode_DATA_ANY(op) \
104 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200105
Victor Stinnere6abb482012-05-02 01:15:40 +0200106/* Optimized version of Py_MAX() to compute the maximum character:
107 use it when you are computing the second argument of PyUnicode_New() */
108#define MAX_MAXCHAR(maxchar1, maxchar2) \
109 ((maxchar1) | (maxchar2))
110
Victor Stinner910337b2011-10-03 03:20:16 +0200111#undef PyUnicode_READY
112#define PyUnicode_READY(op) \
113 (assert(_PyUnicode_CHECK(op)), \
114 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200115 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100116 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200117
Victor Stinnerc379ead2011-10-03 12:52:27 +0200118#define _PyUnicode_SHARE_UTF8(op) \
119 (assert(_PyUnicode_CHECK(op)), \
120 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
121 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the very same buffer as its character
   data (i.e. the wchar_t representation is shared, not separately stored).
   The macro only refers to its own argument, so it can be applied to any
   expression, whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
125
Victor Stinner829c0ad2011-10-03 01:08:02 +0200126/* true if the Unicode object has an allocated UTF-8 memory block
127 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200128#define _PyUnicode_HAS_UTF8_MEMORY(op) \
129 (assert(_PyUnicode_CHECK(op)), \
130 (!PyUnicode_IS_COMPACT_ASCII(op) \
131 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
133
Victor Stinner03490912011-10-03 23:45:12 +0200134/* true if the Unicode object has an allocated wstr memory block
135 (not shared with other data) */
136#define _PyUnicode_HAS_WSTR_MEMORY(op) \
137 (assert(_PyUnicode_CHECK(op)), \
138 (_PyUnicode_WSTR(op) && \
139 (!PyUnicode_IS_READY(op) || \
140 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
Victor Stinner910337b2011-10-03 03:20:16 +0200142/* Generic helper macro to convert characters of different types.
143 from_type and to_type have to be valid type names, begin and end
144 are pointers to the source characters which should be of type
145 "from_type *". to is a pointer of type "to_type *" and points to the
146 buffer where the result characters are written to. */
/* `to` is parenthesized before the cast so that a pointer expression such as
   `buf + offset` keeps its own element scaling; all macro locals carry a
   leading underscore to avoid shadowing names at the expansion site. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        /* copy four items per iteration first ... */   \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        /* ... then the remaining 0-3 items */          \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
Walter Dörwald16807132007-05-25 13:52:07 +0000166/* This dictionary holds all interned unicode strings. Note that references
167 to strings in this dictionary are *not* counted in the string's ob_refcnt.
168 When the interned string reaches a refcnt of 0 the string deallocation
169 function will delete the reference from this dictionary.
170
171 Another way to look at this is to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000172 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000173*/
174static PyObject *interned;
175
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000176/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200177static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200179/* List of static strings. */
180static _Py_Identifier *static_strings;
181
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000182/* Single character Unicode strings in the Latin-1 range are being
183 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200184static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185
Christian Heimes190d79e2008-01-30 11:58:22 +0000186/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by an ASCII code point (0..127): 1 if the character
   is one of the frequent whitespace characters, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
216
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200217/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200218static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100220static int unicode_modifiable(PyObject *unicode);
221
Victor Stinnerfe226c02011-10-03 03:52:20 +0200222
Alexander Belopolsky40018472011-02-26 01:02:56 +0000223static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200224_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
225static PyObject *
226_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
227static PyObject *
228_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
229
230static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000231unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000232 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100233 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000234 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
235
Alexander Belopolsky40018472011-02-26 01:02:56 +0000236static void
237raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300238 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100239 PyObject *unicode,
240 Py_ssize_t startpos, Py_ssize_t endpos,
241 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000242
Christian Heimes190d79e2008-01-30 11:58:22 +0000243/* Same for linebreaks */
/* Lookup table indexed by an ASCII code point (0..127): 1 if the character
   is a line break, 0 otherwise.  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
270
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300271/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
272 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000273Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000274PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000275{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000276#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000277 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000278#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000279 /* This is actually an illegal character, so it should
280 not be passed to unichr. */
281 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282#endif
283}
284
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal layout of a unicode object.

   Every invariant is verified with assert(); the function itself always
   returns 1 so that it can be used as `assert(_PyUnicode_CheckConsistency(...))`.
   When check_content is non-zero (and the string is not a not-yet-ready
   wchar_t string) the characters are scanned as well, to verify that the
   narrowest possible kind is used and that the data is null-terminated. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        /* Anything but a compact ASCII string. */
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* Non-compact object: the data pointer lives in the struct. */
            data = ((PyUnicodeObject *)op)->data.any;

            if (kind != PyUnicode_WCHAR_KIND) {
                /* Non-compact, ready string. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
            else {
                /* Not-ready string: only the wchar_t buffer exists. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* Compact non-ASCII string: data follows the struct. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is expected to alias the data exactly when the kind
               matches the width of wchar_t. */
#if SIZEOF_WCHAR_T == 2
            if (kind == PyUnicode_2BYTE_KIND)
#else
            if (kind == PyUnicode_4BYTE_KIND)
#endif
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* Compact ASCII string. */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos = 0;

        /* Find the largest code point stored in the string. */
        while (pos < ascii->length) {
            Py_UCS4 current = PyUnicode_READ(kind, data, pos);
            if (maxchar < current)
                maxchar = current;
            pos++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* The data must end with a null character. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
400
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100401static PyObject*
402unicode_result_wchar(PyObject *unicode)
403{
404#ifndef Py_DEBUG
405 Py_ssize_t len;
406
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100407 len = _PyUnicode_WSTR_LENGTH(unicode);
408 if (len == 0) {
409 Py_INCREF(unicode_empty);
410 Py_DECREF(unicode);
411 return unicode_empty;
412 }
413
414 if (len == 1) {
415 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
416 if (ch < 256) {
417 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
418 Py_DECREF(unicode);
419 return latin1_char;
420 }
421 }
422
423 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200424 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425 return NULL;
426 }
427#else
Victor Stinneraa771272012-10-04 02:32:58 +0200428 assert(Py_REFCNT(unicode) == 1);
429
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100430 /* don't make the result ready in debug mode to ensure that the caller
431 makes the string ready before using it */
432 assert(_PyUnicode_CheckConsistency(unicode, 1));
433#endif
434 return unicode;
435}
436
437static PyObject*
438unicode_result_ready(PyObject *unicode)
439{
440 Py_ssize_t length;
441
442 length = PyUnicode_GET_LENGTH(unicode);
443 if (length == 0) {
444 if (unicode != unicode_empty) {
445 Py_INCREF(unicode_empty);
446 Py_DECREF(unicode);
447 }
448 return unicode_empty;
449 }
450
451 if (length == 1) {
452 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
453 if (ch < 256) {
454 PyObject *latin1_char = unicode_latin1[ch];
455 if (latin1_char != NULL) {
456 if (unicode != latin1_char) {
457 Py_INCREF(latin1_char);
458 Py_DECREF(unicode);
459 }
460 return latin1_char;
461 }
462 else {
463 assert(_PyUnicode_CheckConsistency(unicode, 1));
464 Py_INCREF(unicode);
465 unicode_latin1[ch] = unicode;
466 return unicode;
467 }
468 }
469 }
470
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 return unicode;
473}
474
475static PyObject*
476unicode_result(PyObject *unicode)
477{
478 assert(_PyUnicode_CHECK(unicode));
479 if (PyUnicode_IS_READY(unicode))
480 return unicode_result_ready(unicode);
481 else
482 return unicode_result_wchar(unicode);
483}
484
Victor Stinnerc4b49542011-12-11 22:44:26 +0100485static PyObject*
486unicode_result_unchanged(PyObject *unicode)
487{
488 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500489 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490 return NULL;
491 Py_INCREF(unicode);
492 return unicode;
493 }
494 else
495 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100496 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100497}
498
Victor Stinner3a50e702011-10-18 21:21:00 +0200499#ifdef HAVE_MBCS
500static OSVERSIONINFOEX winver;
501#endif
502
Thomas Wouters477c8d52006-05-27 19:21:47 +0000503/* --- Bloom Filters ----------------------------------------------------- */
504
505/* stuff to implement simple "bloom filters" for Unicode characters.
506 to keep things simple, we use a single bitmask, using the least 5
507 bits from each unicode character as the bit index. */
508
509/* the linebreak mask is set up by Unicode_Init below */
510
Antoine Pitrouf068f942010-01-13 14:19:12 +0000511#if LONG_BIT >= 128
512#define BLOOM_WIDTH 128
513#elif LONG_BIT >= 64
514#define BLOOM_WIDTH 64
515#elif LONG_BIT >= 32
516#define BLOOM_WIDTH 32
517#else
518#error "LONG_BIT is smaller than 32"
519#endif
520
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521#define BLOOM_MASK unsigned long
522
523static BLOOM_MASK bloom_linebreak;
524
/* BLOOM_ADD sets, and BLOOM tests, the bit of `mask` selected by the low
   bits of `ch`.  `mask` is parenthesized so that an expression such as
   `a | b` can be passed safely. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000527
Benjamin Peterson29060642009-01-31 22:14:21 +0000528#define BLOOM_LINEBREAK(ch) \
529 ((ch) < 128U ? ascii_linebreak[(ch)] : \
530 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000531
Alexander Belopolsky40018472011-02-26 01:02:56 +0000532Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000534{
535 /* calculate simple bloom-style bitmask for a given unicode string */
536
Antoine Pitrouf068f942010-01-13 14:19:12 +0000537 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000538 Py_ssize_t i;
539
540 mask = 0;
541 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543
544 return mask;
545}
546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547#define BLOOM_MEMBER(mask, chr, str) \
548 (BLOOM(mask, chr) \
549 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000550
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200551/* Compilation of templated routines */
552
553#include "stringlib/asciilib.h"
554#include "stringlib/fastsearch.h"
555#include "stringlib/partition.h"
556#include "stringlib/split.h"
557#include "stringlib/count.h"
558#include "stringlib/find.h"
559#include "stringlib/find_max_char.h"
560#include "stringlib/localeutil.h"
561#include "stringlib/undef.h"
562
563#include "stringlib/ucs1lib.h"
564#include "stringlib/fastsearch.h"
565#include "stringlib/partition.h"
566#include "stringlib/split.h"
567#include "stringlib/count.h"
568#include "stringlib/find.h"
569#include "stringlib/find_max_char.h"
570#include "stringlib/localeutil.h"
571#include "stringlib/undef.h"
572
573#include "stringlib/ucs2lib.h"
574#include "stringlib/fastsearch.h"
575#include "stringlib/partition.h"
576#include "stringlib/split.h"
577#include "stringlib/count.h"
578#include "stringlib/find.h"
579#include "stringlib/find_max_char.h"
580#include "stringlib/localeutil.h"
581#include "stringlib/undef.h"
582
583#include "stringlib/ucs4lib.h"
584#include "stringlib/fastsearch.h"
585#include "stringlib/partition.h"
586#include "stringlib/split.h"
587#include "stringlib/count.h"
588#include "stringlib/find.h"
589#include "stringlib/find_max_char.h"
590#include "stringlib/localeutil.h"
591#include "stringlib/undef.h"
592
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200593#include "stringlib/unicodedefs.h"
594#include "stringlib/fastsearch.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100597#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599/* --- Unicode Object ----------------------------------------------------- */
600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200601static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200602fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200603
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200604Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
605 Py_ssize_t size, Py_UCS4 ch,
606 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200608 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
609
610 switch (kind) {
611 case PyUnicode_1BYTE_KIND:
612 {
613 Py_UCS1 ch1 = (Py_UCS1) ch;
614 if (ch1 == ch)
615 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
616 else
617 return -1;
618 }
619 case PyUnicode_2BYTE_KIND:
620 {
621 Py_UCS2 ch2 = (Py_UCS2) ch;
622 if (ch2 == ch)
623 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_4BYTE_KIND:
628 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
629 default:
630 assert(0);
631 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200633}
634
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten (with 0xff bytes); nothing is done when the string did not
   grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the size in bytes of one character */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
653
Victor Stinnerfe226c02011-10-03 03:52:20 +0200654static PyObject*
655resize_compact(PyObject *unicode, Py_ssize_t length)
656{
657 Py_ssize_t char_size;
658 Py_ssize_t struct_size;
659 Py_ssize_t new_size;
660 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100661 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200662#ifdef Py_DEBUG
663 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
664#endif
665
Victor Stinner79891572012-05-03 13:43:07 +0200666 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200667 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100668 assert(PyUnicode_IS_COMPACT(unicode));
669
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200670 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100671 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200672 struct_size = sizeof(PyASCIIObject);
673 else
674 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200675 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676
Victor Stinnerfe226c02011-10-03 03:52:20 +0200677 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
678 PyErr_NoMemory();
679 return NULL;
680 }
681 new_size = (struct_size + (length + 1) * char_size);
682
Victor Stinner84def372011-12-11 20:04:56 +0100683 _Py_DEC_REFTOTAL;
684 _Py_ForgetReference(unicode);
685
686 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
687 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100688 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200689 PyErr_NoMemory();
690 return NULL;
691 }
Victor Stinner84def372011-12-11 20:04:56 +0100692 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200696 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100698 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200699 _PyUnicode_WSTR_LENGTH(unicode) = length;
700 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200701#ifdef Py_DEBUG
702 unicode_fill_invalid(unicode, old_length);
703#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
705 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200706 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200707 return unicode;
708}
709
Alexander Belopolsky40018472011-02-26 01:02:56 +0000710static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200711resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712{
Victor Stinner95663112011-10-04 01:03:50 +0200713 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100714 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 if (PyUnicode_IS_READY(unicode)) {
719 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200720 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200722#ifdef Py_DEBUG
723 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
724#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725
726 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200727 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200728 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
729 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730
731 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
732 PyErr_NoMemory();
733 return -1;
734 }
735 new_size = (length + 1) * char_size;
736
Victor Stinner7a9105a2011-12-12 00:13:42 +0100737 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
738 {
739 PyObject_DEL(_PyUnicode_UTF8(unicode));
740 _PyUnicode_UTF8(unicode) = NULL;
741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
742 }
743
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744 data = (PyObject *)PyObject_REALLOC(data, new_size);
745 if (data == NULL) {
746 PyErr_NoMemory();
747 return -1;
748 }
749 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200750 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200751 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200752 _PyUnicode_WSTR_LENGTH(unicode) = length;
753 }
754 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200755 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200756 _PyUnicode_UTF8_LENGTH(unicode) = length;
757 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200758 _PyUnicode_LENGTH(unicode) = length;
759 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200760#ifdef Py_DEBUG
761 unicode_fill_invalid(unicode, old_length);
762#endif
Victor Stinner95663112011-10-04 01:03:50 +0200763 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200764 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 }
Victor Stinner95663112011-10-04 01:03:50 +0200768 assert(_PyUnicode_WSTR(unicode) != NULL);
769
770 /* check for integer overflow */
771 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
772 PyErr_NoMemory();
773 return -1;
774 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100775 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200776 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100777 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200778 if (!wstr) {
779 PyErr_NoMemory();
780 return -1;
781 }
782 _PyUnicode_WSTR(unicode) = wstr;
783 _PyUnicode_WSTR(unicode)[length] = 0;
784 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200785 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000786 return 0;
787}
788
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789static PyObject*
790resize_copy(PyObject *unicode, Py_ssize_t length)
791{
792 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100793 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100795
Benjamin Petersonbac79492012-01-14 13:34:47 -0500796 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100797 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200798
799 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
800 if (copy == NULL)
801 return NULL;
802
803 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200804 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200805 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200806 }
807 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200808 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100809
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200810 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 if (w == NULL)
812 return NULL;
813 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
814 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200815 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
816 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200817 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200818 }
819}
820
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000822 Ux0000 terminated; some code (e.g. new_identifier)
823 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824
825 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000827
828*/
829
Alexander Belopolsky40018472011-02-26 01:02:56 +0000830static PyUnicodeObject *
831_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832{
833 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200834 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835
Thomas Wouters477c8d52006-05-27 19:21:47 +0000836 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000837 if (length == 0 && unicode_empty != NULL) {
838 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200839 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840 }
841
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000842 /* Ensure we won't overflow the size. */
843 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
844 return (PyUnicodeObject *)PyErr_NoMemory();
845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 if (length < 0) {
847 PyErr_SetString(PyExc_SystemError,
848 "Negative size passed to _PyUnicode_New");
849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000850 }
851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
853 if (unicode == NULL)
854 return NULL;
855 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
856 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
857 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100858 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000859 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100860 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200862
Jeremy Hyltond8082792003-09-16 19:41:39 +0000863 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000864 * the caller fails before initializing str -- unicode_resize()
865 * reads str[0], and the Keep-Alive optimization can keep memory
866 * allocated for str alive across a call to unicode_dealloc(unicode).
867 * We don't want unicode_resize to read uninitialized memory in
868 * that case.
869 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870 _PyUnicode_WSTR(unicode)[0] = 0;
871 _PyUnicode_WSTR(unicode)[length] = 0;
872 _PyUnicode_WSTR_LENGTH(unicode) = length;
873 _PyUnicode_HASH(unicode) = -1;
874 _PyUnicode_STATE(unicode).interned = 0;
875 _PyUnicode_STATE(unicode).kind = 0;
876 _PyUnicode_STATE(unicode).compact = 0;
877 _PyUnicode_STATE(unicode).ready = 0;
878 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200879 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200881 _PyUnicode_UTF8(unicode) = NULL;
882 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100883 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 return unicode;
885}
886
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887static const char*
888unicode_kind_name(PyObject *unicode)
889{
Victor Stinner42dfd712011-10-03 14:41:45 +0200890 /* don't check consistency: unicode_kind_name() is called from
891 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 if (!PyUnicode_IS_COMPACT(unicode))
893 {
894 if (!PyUnicode_IS_READY(unicode))
895 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600896 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 {
898 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200899 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 return "legacy ascii";
901 else
902 return "legacy latin1";
903 case PyUnicode_2BYTE_KIND:
904 return "legacy UCS2";
905 case PyUnicode_4BYTE_KIND:
906 return "legacy UCS4";
907 default:
908 return "<legacy invalid kind>";
909 }
910 }
911 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600912 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200913 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200914 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200915 return "ascii";
916 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200917 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200918 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200919 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200920 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200921 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200922 default:
923 return "<invalid compact kind>";
924 }
925}
926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
932
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
936void *_PyUnicode_data(void *unicode){
937 printf("obj %p\n", unicode);
938 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
939 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
940 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
941 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
942 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
943 return PyUnicode_DATA(unicode);
944}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200945
946void
947_PyUnicode_Dump(PyObject *op)
948{
949 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200950 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
951 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
952 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200953
Victor Stinnera849a4b2011-10-03 12:12:11 +0200954 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200955 {
956 if (ascii->state.ascii)
957 data = (ascii + 1);
958 else
959 data = (compact + 1);
960 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200961 else
962 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200963 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
964
Victor Stinnera849a4b2011-10-03 12:12:11 +0200965 if (ascii->wstr == data)
966 printf("shared ");
967 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200968
Victor Stinnera3b334d2011-10-03 13:53:37 +0200969 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200970 printf(" (%zu), ", compact->wstr_length);
971 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
972 printf("shared ");
973 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200974 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200975 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200976}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977#endif
978
979PyObject *
980PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
981{
982 PyObject *obj;
983 PyCompactUnicodeObject *unicode;
984 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200985 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200986 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 Py_ssize_t char_size;
988 Py_ssize_t struct_size;
989
990 /* Optimization for empty strings */
991 if (size == 0 && unicode_empty != NULL) {
992 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200993 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200994 }
995
Victor Stinner9e9d6892011-10-04 01:02:02 +0200996 is_ascii = 0;
997 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200998 struct_size = sizeof(PyCompactUnicodeObject);
999 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001000 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001 char_size = 1;
1002 is_ascii = 1;
1003 struct_size = sizeof(PyASCIIObject);
1004 }
1005 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001006 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001007 char_size = 1;
1008 }
1009 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001010 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001011 char_size = 2;
1012 if (sizeof(wchar_t) == 2)
1013 is_sharing = 1;
1014 }
1015 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001016 if (maxchar > MAX_UNICODE) {
1017 PyErr_SetString(PyExc_SystemError,
1018 "invalid maximum character passed to PyUnicode_New");
1019 return NULL;
1020 }
Victor Stinner8f825062012-04-27 13:55:39 +02001021 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001022 char_size = 4;
1023 if (sizeof(wchar_t) == 4)
1024 is_sharing = 1;
1025 }
1026
1027 /* Ensure we won't overflow the size. */
1028 if (size < 0) {
1029 PyErr_SetString(PyExc_SystemError,
1030 "Negative size passed to PyUnicode_New");
1031 return NULL;
1032 }
1033 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1034 return PyErr_NoMemory();
1035
1036 /* Duplicated allocation code from _PyObject_New() instead of a call to
1037 * PyObject_New() so we are able to allocate space for the object and
1038 * it's data buffer.
1039 */
1040 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1041 if (obj == NULL)
1042 return PyErr_NoMemory();
1043 obj = PyObject_INIT(obj, &PyUnicode_Type);
1044 if (obj == NULL)
1045 return NULL;
1046
1047 unicode = (PyCompactUnicodeObject *)obj;
1048 if (is_ascii)
1049 data = ((PyASCIIObject*)obj) + 1;
1050 else
1051 data = unicode + 1;
1052 _PyUnicode_LENGTH(unicode) = size;
1053 _PyUnicode_HASH(unicode) = -1;
1054 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001055 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 _PyUnicode_STATE(unicode).compact = 1;
1057 _PyUnicode_STATE(unicode).ready = 1;
1058 _PyUnicode_STATE(unicode).ascii = is_ascii;
1059 if (is_ascii) {
1060 ((char*)data)[size] = 0;
1061 _PyUnicode_WSTR(unicode) = NULL;
1062 }
Victor Stinner8f825062012-04-27 13:55:39 +02001063 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 ((char*)data)[size] = 0;
1065 _PyUnicode_WSTR(unicode) = NULL;
1066 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001068 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 else {
1071 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001072 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001073 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001075 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001076 ((Py_UCS4*)data)[size] = 0;
1077 if (is_sharing) {
1078 _PyUnicode_WSTR_LENGTH(unicode) = size;
1079 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1080 }
1081 else {
1082 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1083 _PyUnicode_WSTR(unicode) = NULL;
1084 }
1085 }
Victor Stinner8f825062012-04-27 13:55:39 +02001086#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001087 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001088#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001089 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 return obj;
1091}
1092
1093#if SIZEOF_WCHAR_T == 2
1094/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1095 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001096 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097
1098 This function assumes that unicode can hold one more code point than wstr
1099 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001100static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103{
1104 const wchar_t *iter;
1105 Py_UCS4 *ucs4_out;
1106
Victor Stinner910337b2011-10-03 03:20:16 +02001107 assert(unicode != NULL);
1108 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1110 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1111
1112 for (iter = begin; iter < end; ) {
1113 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1114 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001115 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1116 && (iter+1) < end
1117 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 {
Victor Stinner551ac952011-11-29 22:58:13 +01001119 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 iter += 2;
1121 }
1122 else {
1123 *ucs4_out++ = *iter;
1124 iter++;
1125 }
1126 }
1127 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1128 _PyUnicode_GET_LENGTH(unicode)));
1129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130}
1131#endif
1132
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133static int
Victor Stinner488fa492011-12-12 00:01:39 +01001134unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001135{
Victor Stinner488fa492011-12-12 00:01:39 +01001136 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001137 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001138 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return -1;
1140 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141 return 0;
1142}
1143
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144static int
1145_copy_characters(PyObject *to, Py_ssize_t to_start,
1146 PyObject *from, Py_ssize_t from_start,
1147 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 unsigned int from_kind, to_kind;
1150 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151
Victor Stinneree4544c2012-05-09 22:24:08 +02001152 assert(0 <= how_many);
1153 assert(0 <= from_start);
1154 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001155 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001157 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158
Victor Stinnerd3f08822012-05-29 12:57:52 +02001159 assert(PyUnicode_Check(to));
1160 assert(PyUnicode_IS_READY(to));
1161 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1162
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001163 if (how_many == 0)
1164 return 0;
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001167 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170
Victor Stinnerf1852262012-06-16 16:38:26 +02001171#ifdef Py_DEBUG
1172 if (!check_maxchar
1173 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1174 {
1175 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1176 Py_UCS4 ch;
1177 Py_ssize_t i;
1178 for (i=0; i < how_many; i++) {
1179 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1180 assert(ch <= to_maxchar);
1181 }
1182 }
1183#endif
1184
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001185 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001186 if (check_maxchar
1187 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1188 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001189 /* Writing Latin-1 characters into an ASCII string requires to
1190 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001191 Py_UCS4 max_char;
1192 max_char = ucs1lib_find_max_char(from_data,
1193 (Py_UCS1*)from_data + how_many);
1194 if (max_char >= 128)
1195 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001196 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001197 Py_MEMCPY((char*)to_data + to_kind * to_start,
1198 (char*)from_data + from_kind * from_start,
1199 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001201 else if (from_kind == PyUnicode_1BYTE_KIND
1202 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 {
1204 _PyUnicode_CONVERT_BYTES(
1205 Py_UCS1, Py_UCS2,
1206 PyUnicode_1BYTE_DATA(from) + from_start,
1207 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1208 PyUnicode_2BYTE_DATA(to) + to_start
1209 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001210 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001211 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001212 && to_kind == PyUnicode_4BYTE_KIND)
1213 {
1214 _PyUnicode_CONVERT_BYTES(
1215 Py_UCS1, Py_UCS4,
1216 PyUnicode_1BYTE_DATA(from) + from_start,
1217 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1218 PyUnicode_4BYTE_DATA(to) + to_start
1219 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001220 }
1221 else if (from_kind == PyUnicode_2BYTE_KIND
1222 && to_kind == PyUnicode_4BYTE_KIND)
1223 {
1224 _PyUnicode_CONVERT_BYTES(
1225 Py_UCS2, Py_UCS4,
1226 PyUnicode_2BYTE_DATA(from) + from_start,
1227 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1228 PyUnicode_4BYTE_DATA(to) + to_start
1229 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001230 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001231 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001232 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1233
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001234 if (!check_maxchar) {
1235 if (from_kind == PyUnicode_2BYTE_KIND
1236 && to_kind == PyUnicode_1BYTE_KIND)
1237 {
1238 _PyUnicode_CONVERT_BYTES(
1239 Py_UCS2, Py_UCS1,
1240 PyUnicode_2BYTE_DATA(from) + from_start,
1241 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1242 PyUnicode_1BYTE_DATA(to) + to_start
1243 );
1244 }
1245 else if (from_kind == PyUnicode_4BYTE_KIND
1246 && to_kind == PyUnicode_1BYTE_KIND)
1247 {
1248 _PyUnicode_CONVERT_BYTES(
1249 Py_UCS4, Py_UCS1,
1250 PyUnicode_4BYTE_DATA(from) + from_start,
1251 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1252 PyUnicode_1BYTE_DATA(to) + to_start
1253 );
1254 }
1255 else if (from_kind == PyUnicode_4BYTE_KIND
1256 && to_kind == PyUnicode_2BYTE_KIND)
1257 {
1258 _PyUnicode_CONVERT_BYTES(
1259 Py_UCS4, Py_UCS2,
1260 PyUnicode_4BYTE_DATA(from) + from_start,
1261 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1262 PyUnicode_2BYTE_DATA(to) + to_start
1263 );
1264 }
1265 else {
1266 assert(0);
1267 return -1;
1268 }
1269 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001270 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001271 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001272 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001273 Py_ssize_t i;
1274
Victor Stinnera0702ab2011-09-29 14:14:38 +02001275 for (i=0; i < how_many; i++) {
1276 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001277 if (ch > to_maxchar)
1278 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001279 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1280 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001281 }
1282 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001283 return 0;
1284}
1285
Victor Stinnerd3f08822012-05-29 12:57:52 +02001286void
1287_PyUnicode_FastCopyCharacters(
1288 PyObject *to, Py_ssize_t to_start,
1289 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001290{
1291 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1292}
1293
1294Py_ssize_t
1295PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1296 PyObject *from, Py_ssize_t from_start,
1297 Py_ssize_t how_many)
1298{
1299 int err;
1300
1301 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1302 PyErr_BadInternalCall();
1303 return -1;
1304 }
1305
Benjamin Petersonbac79492012-01-14 13:34:47 -05001306 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001307 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001308 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001309 return -1;
1310
Victor Stinnerd3f08822012-05-29 12:57:52 +02001311 if (from_start < 0) {
1312 PyErr_SetString(PyExc_IndexError, "string index out of range");
1313 return -1;
1314 }
1315 if (to_start < 0) {
1316 PyErr_SetString(PyExc_IndexError, "string index out of range");
1317 return -1;
1318 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001319 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1320 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1321 PyErr_Format(PyExc_SystemError,
1322 "Cannot write %zi characters at %zi "
1323 "in a string of %zi characters",
1324 how_many, to_start, PyUnicode_GET_LENGTH(to));
1325 return -1;
1326 }
1327
1328 if (how_many == 0)
1329 return 0;
1330
Victor Stinner488fa492011-12-12 00:01:39 +01001331 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001332 return -1;
1333
1334 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1335 if (err) {
1336 PyErr_Format(PyExc_SystemError,
1337 "Cannot copy %s characters "
1338 "into a string of %s characters",
1339 unicode_kind_name(from),
1340 unicode_kind_name(to));
1341 return -1;
1342 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001343 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344}
1345
Victor Stinner17222162011-09-28 22:15:37 +02001346/* Find the maximum code point and count the number of surrogate pairs so a
1347 correct string length can be computed before converting a string to UCS4.
1348 This function counts single surrogates as a character and not as a pair.
1349
1350 Return 0 on success, or -1 on error. */
1351static int
1352find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1353 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001354{
1355 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001356 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357
Victor Stinnerc53be962011-10-02 21:33:54 +02001358 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001359 *num_surrogates = 0;
1360 *maxchar = 0;
1361
1362 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001364 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1365 && (iter+1) < end
1366 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001368 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 iter += 2;
1371 }
1372 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001374 {
1375 ch = *iter;
1376 iter++;
1377 }
1378 if (ch > *maxchar) {
1379 *maxchar = ch;
1380 if (*maxchar > MAX_UNICODE) {
1381 PyErr_Format(PyExc_ValueError,
1382 "character U+%x is not in range [U+0000; U+10ffff]",
1383 ch);
1384 return -1;
1385 }
1386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 }
1388 return 0;
1389}
1390
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001391int
1392_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393{
1394 wchar_t *end;
1395 Py_UCS4 maxchar = 0;
1396 Py_ssize_t num_surrogates;
1397#if SIZEOF_WCHAR_T == 2
1398 Py_ssize_t length_wo_surrogates;
1399#endif
1400
Georg Brandl7597add2011-10-05 16:36:47 +02001401 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001402 strings were created using _PyObject_New() and where no canonical
1403 representation (the str field) has been set yet aka strings
1404 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001405 assert(_PyUnicode_CHECK(unicode));
1406 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001408 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001409 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001410 /* Actually, it should neither be interned nor be anything else: */
1411 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001414 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001415 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417
1418 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001419 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1420 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 PyErr_NoMemory();
1422 return -1;
1423 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001424 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 _PyUnicode_WSTR(unicode), end,
1426 PyUnicode_1BYTE_DATA(unicode));
1427 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1428 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1429 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1430 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001431 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001432 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001433 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 }
1435 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001436 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001437 _PyUnicode_UTF8(unicode) = NULL;
1438 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 }
1440 PyObject_FREE(_PyUnicode_WSTR(unicode));
1441 _PyUnicode_WSTR(unicode) = NULL;
1442 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1443 }
1444 /* In this case we might have to convert down from 4-byte native
1445 wchar_t to 2-byte unicode. */
1446 else if (maxchar < 65536) {
1447 assert(num_surrogates == 0 &&
1448 "FindMaxCharAndNumSurrogatePairs() messed up");
1449
Victor Stinner506f5922011-09-28 22:34:18 +02001450#if SIZEOF_WCHAR_T == 2
1451 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001452 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001453 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1454 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1455 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001456 _PyUnicode_UTF8(unicode) = NULL;
1457 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001458#else
1459 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001460 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001461 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001462 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001463 PyErr_NoMemory();
1464 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 }
Victor Stinner506f5922011-09-28 22:34:18 +02001466 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1467 _PyUnicode_WSTR(unicode), end,
1468 PyUnicode_2BYTE_DATA(unicode));
1469 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1470 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1471 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001472 _PyUnicode_UTF8(unicode) = NULL;
1473 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001474 PyObject_FREE(_PyUnicode_WSTR(unicode));
1475 _PyUnicode_WSTR(unicode) = NULL;
1476 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1477#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 }
1479 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1480 else {
1481#if SIZEOF_WCHAR_T == 2
1482 /* in case the native representation is 2-bytes, we need to allocate a
1483 new normalized 4-byte version. */
1484 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001485 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1486 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 PyErr_NoMemory();
1488 return -1;
1489 }
1490 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1491 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001492 _PyUnicode_UTF8(unicode) = NULL;
1493 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001494 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1495 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001496 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 PyObject_FREE(_PyUnicode_WSTR(unicode));
1498 _PyUnicode_WSTR(unicode) = NULL;
1499 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1500#else
1501 assert(num_surrogates == 0);
1502
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001505 _PyUnicode_UTF8(unicode) = NULL;
1506 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1508#endif
1509 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1510 }
1511 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001512 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001513 return 0;
1514}
1515
Alexander Belopolsky40018472011-02-26 01:02:56 +00001516static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001517unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518{
Walter Dörwald16807132007-05-25 13:52:07 +00001519 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001520 case SSTATE_NOT_INTERNED:
1521 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001522
Benjamin Peterson29060642009-01-31 22:14:21 +00001523 case SSTATE_INTERNED_MORTAL:
1524 /* revive dead object temporarily for DelItem */
1525 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001526 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001527 Py_FatalError(
1528 "deletion of interned string failed");
1529 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 case SSTATE_INTERNED_IMMORTAL:
1532 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001533
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 default:
1535 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001536 }
1537
Victor Stinner03490912011-10-03 23:45:12 +02001538 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001539 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001540 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001541 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001542 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1543 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001545 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001546}
1547
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if unicode is the shared empty string or one
   of the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only a non-legacy string of length 1 can be a cached character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;
    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1564
Alexander Belopolsky40018472011-02-26 01:02:56 +00001565static int
Victor Stinner488fa492011-12-12 00:01:39 +01001566unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001567{
Victor Stinner488fa492011-12-12 00:01:39 +01001568 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001569 if (Py_REFCNT(unicode) != 1)
1570 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (_PyUnicode_HASH(unicode) != -1)
1572 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001573 if (PyUnicode_CHECK_INTERNED(unicode))
1574 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001575 if (!PyUnicode_CheckExact(unicode))
1576 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001577#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001578 /* singleton refcount is greater than 1 */
1579 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001580#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 return 1;
1582}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584static int
1585unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1586{
1587 PyObject *unicode;
1588 Py_ssize_t old_length;
1589
1590 assert(p_unicode != NULL);
1591 unicode = *p_unicode;
1592
1593 assert(unicode != NULL);
1594 assert(PyUnicode_Check(unicode));
1595 assert(0 <= length);
1596
Victor Stinner910337b2011-10-03 03:20:16 +02001597 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001598 old_length = PyUnicode_WSTR_LENGTH(unicode);
1599 else
1600 old_length = PyUnicode_GET_LENGTH(unicode);
1601 if (old_length == length)
1602 return 0;
1603
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001604 if (length == 0) {
1605 Py_DECREF(*p_unicode);
1606 *p_unicode = unicode_empty;
1607 Py_INCREF(*p_unicode);
1608 return 0;
1609 }
1610
Victor Stinner488fa492011-12-12 00:01:39 +01001611 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001612 PyObject *copy = resize_copy(unicode, length);
1613 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001614 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 Py_DECREF(*p_unicode);
1616 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618 }
1619
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001621 PyObject *new_unicode = resize_compact(unicode, length);
1622 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001624 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001625 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001626 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001627 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001628}
1629
Alexander Belopolsky40018472011-02-26 01:02:56 +00001630int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 PyObject *unicode;
1634 if (p_unicode == NULL) {
1635 PyErr_BadInternalCall();
1636 return -1;
1637 }
1638 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001639 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001640 {
1641 PyErr_BadInternalCall();
1642 return -1;
1643 }
1644 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001645}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001646
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001648unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1649 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001650{
1651 PyObject *result;
1652 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001653 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001654 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1655 return 0;
1656 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1657 maxchar);
1658 if (result == NULL)
1659 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001660 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 Py_DECREF(*p_unicode);
1662 *p_unicode = result;
1663 return 0;
1664}
1665
1666static int
1667unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1668 Py_UCS4 ch)
1669{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001670 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001671 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001672 return -1;
1673 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1674 PyUnicode_DATA(*p_unicode),
1675 (*pos)++, ch);
1676 return 0;
1677}
1678
Victor Stinnerc5166102012-02-22 13:55:02 +01001679/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001680
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001681 WARNING: The function doesn't copy the terminating null character and
1682 doesn't check the maximum character (may write a latin1 character in an
1683 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001684static void
1685unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1686 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001687{
1688 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1689 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001690 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001691
1692 switch (kind) {
1693 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001694 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001695#ifdef Py_DEBUG
1696 if (PyUnicode_IS_ASCII(unicode)) {
1697 Py_UCS4 maxchar = ucs1lib_find_max_char(
1698 (const Py_UCS1*)str,
1699 (const Py_UCS1*)str + len);
1700 assert(maxchar < 128);
1701 }
1702#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001703 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001704 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001705 }
1706 case PyUnicode_2BYTE_KIND: {
1707 Py_UCS2 *start = (Py_UCS2 *)data + index;
1708 Py_UCS2 *ucs2 = start;
1709 assert(index <= PyUnicode_GET_LENGTH(unicode));
1710
Victor Stinner184252a2012-06-16 02:57:41 +02001711 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001712 *ucs2 = (Py_UCS2)*str;
1713
1714 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001715 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001716 }
1717 default: {
1718 Py_UCS4 *start = (Py_UCS4 *)data + index;
1719 Py_UCS4 *ucs4 = start;
1720 assert(kind == PyUnicode_4BYTE_KIND);
1721 assert(index <= PyUnicode_GET_LENGTH(unicode));
1722
Victor Stinner184252a2012-06-16 02:57:41 +02001723 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001724 *ucs4 = (Py_UCS4)*str;
1725
1726 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001727 }
1728 }
1729}
1730
1731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001732static PyObject*
1733get_latin1_char(unsigned char ch)
1734{
Victor Stinnera464fc12011-10-02 20:39:30 +02001735 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001737 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 if (!unicode)
1739 return NULL;
1740 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001741 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742 unicode_latin1[ch] = unicode;
1743 }
1744 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001745 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746}
1747
Alexander Belopolsky40018472011-02-26 01:02:56 +00001748PyObject *
1749PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001751 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 Py_UCS4 maxchar = 0;
1753 Py_ssize_t num_surrogates;
1754
1755 if (u == NULL)
1756 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001758 /* If the Unicode data is known at construction time, we can apply
1759 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 /* Optimization for empty strings */
1762 if (size == 0 && unicode_empty != NULL) {
1763 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001764 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001765 }
Tim Petersced69f82003-09-16 20:30:58 +00001766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 /* Single character Unicode objects in the Latin-1 range are
1768 shared when using this constructor */
1769 if (size == 1 && *u < 256)
1770 return get_latin1_char((unsigned char)*u);
1771
1772 /* If not empty and not single character, copy the Unicode data
1773 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001774 if (find_maxchar_surrogates(u, u + size,
1775 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 return NULL;
1777
Victor Stinner8faf8212011-12-08 22:14:11 +01001778 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 if (!unicode)
1780 return NULL;
1781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 switch (PyUnicode_KIND(unicode)) {
1783 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001784 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1786 break;
1787 case PyUnicode_2BYTE_KIND:
1788#if Py_UNICODE_SIZE == 2
1789 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1790#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001791 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1793#endif
1794 break;
1795 case PyUnicode_4BYTE_KIND:
1796#if SIZEOF_WCHAR_T == 2
1797 /* This is the only case which has to process surrogates, thus
1798 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001799 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800#else
1801 assert(num_surrogates == 0);
1802 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1803#endif
1804 break;
1805 default:
1806 assert(0 && "Impossible state");
1807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001809 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810}
1811
Alexander Belopolsky40018472011-02-26 01:02:56 +00001812PyObject *
1813PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001814{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001815 if (size < 0) {
1816 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001817 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001818 return NULL;
1819 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001820 if (u != NULL)
1821 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1822 else
1823 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001824}
1825
Alexander Belopolsky40018472011-02-26 01:02:56 +00001826PyObject *
1827PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001828{
1829 size_t size = strlen(u);
1830 if (size > PY_SSIZE_T_MAX) {
1831 PyErr_SetString(PyExc_OverflowError, "input too long");
1832 return NULL;
1833 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001834 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001835}
1836
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001837PyObject *
1838_PyUnicode_FromId(_Py_Identifier *id)
1839{
1840 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001841 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1842 strlen(id->string),
1843 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001844 if (!id->object)
1845 return NULL;
1846 PyUnicode_InternInPlace(&id->object);
1847 assert(!id->next);
1848 id->next = static_strings;
1849 static_strings = id;
1850 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001851 return id->object;
1852}
1853
1854void
1855_PyUnicode_ClearStaticStrings()
1856{
1857 _Py_Identifier *i;
1858 for (i = static_strings; i; i = i->next) {
1859 Py_DECREF(i->object);
1860 i->object = NULL;
1861 i->next = NULL;
1862 }
1863}
1864
Benjamin Peterson0df54292012-03-26 14:50:32 -04001865/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001866
Victor Stinnerd3f08822012-05-29 12:57:52 +02001867PyObject*
1868_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001869{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001870 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001871 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001872 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001873#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001874 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001875#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001876 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001877 }
Victor Stinner785938e2011-12-11 20:09:03 +01001878 unicode = PyUnicode_New(size, 127);
1879 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001880 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001881 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1882 assert(_PyUnicode_CheckConsistency(unicode, 1));
1883 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001884}
1885
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001886static Py_UCS4
1887kind_maxchar_limit(unsigned int kind)
1888{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001889 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001890 case PyUnicode_1BYTE_KIND:
1891 return 0x80;
1892 case PyUnicode_2BYTE_KIND:
1893 return 0x100;
1894 case PyUnicode_4BYTE_KIND:
1895 return 0x10000;
1896 default:
1897 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001898 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001899 }
1900}
1901
Victor Stinnere6abb482012-05-02 01:15:40 +02001902Py_LOCAL_INLINE(Py_UCS4)
1903align_maxchar(Py_UCS4 maxchar)
1904{
1905 if (maxchar <= 127)
1906 return 127;
1907 else if (maxchar <= 255)
1908 return 255;
1909 else if (maxchar <= 65535)
1910 return 65535;
1911 else
1912 return MAX_UNICODE;
1913}
1914
Victor Stinner702c7342011-10-05 13:50:52 +02001915static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001916_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001917{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001919 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001920
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001921 if (size == 0) {
1922 Py_INCREF(unicode_empty);
1923 return unicode_empty;
1924 }
1925 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001926 if (size == 1)
1927 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001928
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001929 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001930 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001931 if (!res)
1932 return NULL;
1933 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001934 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001936}
1937
Victor Stinnere57b1c02011-09-28 22:20:48 +02001938static PyObject*
1939_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940{
1941 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001942 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001943
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001944 if (size == 0) {
1945 Py_INCREF(unicode_empty);
1946 return unicode_empty;
1947 }
1948 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001949 if (size == 1) {
1950 Py_UCS4 ch = u[0];
1951 if (ch < 256)
1952 return get_latin1_char((unsigned char)ch);
1953
1954 res = PyUnicode_New(1, ch);
1955 if (res == NULL)
1956 return NULL;
1957 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1958 assert(_PyUnicode_CheckConsistency(res, 1));
1959 return res;
1960 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001961
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001962 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001963 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 if (!res)
1965 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001966 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001968 else {
1969 _PyUnicode_CONVERT_BYTES(
1970 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1971 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001972 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 return res;
1974}
1975
Victor Stinnere57b1c02011-09-28 22:20:48 +02001976static PyObject*
1977_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978{
1979 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001980 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001981
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001982 if (size == 0) {
1983 Py_INCREF(unicode_empty);
1984 return unicode_empty;
1985 }
1986 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001987 if (size == 1) {
1988 Py_UCS4 ch = u[0];
1989 if (ch < 256)
1990 return get_latin1_char((unsigned char)ch);
1991
1992 res = PyUnicode_New(1, ch);
1993 if (res == NULL)
1994 return NULL;
1995 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1996 assert(_PyUnicode_CheckConsistency(res, 1));
1997 return res;
1998 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001999
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002000 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002001 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 if (!res)
2003 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002004 if (max_char < 256)
2005 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2006 PyUnicode_1BYTE_DATA(res));
2007 else if (max_char < 0x10000)
2008 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2009 PyUnicode_2BYTE_DATA(res));
2010 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002012 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 return res;
2014}
2015
2016PyObject*
2017PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2018{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002019 if (size < 0) {
2020 PyErr_SetString(PyExc_ValueError, "size must be positive");
2021 return NULL;
2022 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002023 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002025 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002027 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002029 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002030 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002031 PyErr_SetString(PyExc_SystemError, "invalid kind");
2032 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034}
2035
Victor Stinnerece58de2012-04-23 23:36:38 +02002036Py_UCS4
2037_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2038{
2039 enum PyUnicode_Kind kind;
2040 void *startptr, *endptr;
2041
2042 assert(PyUnicode_IS_READY(unicode));
2043 assert(0 <= start);
2044 assert(end <= PyUnicode_GET_LENGTH(unicode));
2045 assert(start <= end);
2046
2047 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2048 return PyUnicode_MAX_CHAR_VALUE(unicode);
2049
2050 if (start == end)
2051 return 127;
2052
Victor Stinner94d558b2012-04-27 22:26:58 +02002053 if (PyUnicode_IS_ASCII(unicode))
2054 return 127;
2055
Victor Stinnerece58de2012-04-23 23:36:38 +02002056 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002057 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002058 endptr = (char *)startptr + end * kind;
2059 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002060 switch(kind) {
2061 case PyUnicode_1BYTE_KIND:
2062 return ucs1lib_find_max_char(startptr, endptr);
2063 case PyUnicode_2BYTE_KIND:
2064 return ucs2lib_find_max_char(startptr, endptr);
2065 case PyUnicode_4BYTE_KIND:
2066 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002067 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002068 assert(0);
2069 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002070 }
2071}
2072
Victor Stinner25a4b292011-10-06 12:31:55 +02002073/* Ensure that a string uses the most efficient storage, if it is not the
2074 case: create a new string with of the right kind. Write NULL into *p_unicode
2075 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002076static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002077unicode_adjust_maxchar(PyObject **p_unicode)
2078{
2079 PyObject *unicode, *copy;
2080 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002081 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002082 unsigned int kind;
2083
2084 assert(p_unicode != NULL);
2085 unicode = *p_unicode;
2086 assert(PyUnicode_IS_READY(unicode));
2087 if (PyUnicode_IS_ASCII(unicode))
2088 return;
2089
2090 len = PyUnicode_GET_LENGTH(unicode);
2091 kind = PyUnicode_KIND(unicode);
2092 if (kind == PyUnicode_1BYTE_KIND) {
2093 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002094 max_char = ucs1lib_find_max_char(u, u + len);
2095 if (max_char >= 128)
2096 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002097 }
2098 else if (kind == PyUnicode_2BYTE_KIND) {
2099 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002100 max_char = ucs2lib_find_max_char(u, u + len);
2101 if (max_char >= 256)
2102 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002103 }
2104 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002105 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002106 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002107 max_char = ucs4lib_find_max_char(u, u + len);
2108 if (max_char >= 0x10000)
2109 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002110 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002111 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002112 if (copy != NULL)
2113 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002114 Py_DECREF(unicode);
2115 *p_unicode = copy;
2116}
2117
Victor Stinner034f6cf2011-09-30 02:26:44 +02002118PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002119_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002120{
Victor Stinner87af4f22011-11-21 23:03:47 +01002121 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002122 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002123
Victor Stinner034f6cf2011-09-30 02:26:44 +02002124 if (!PyUnicode_Check(unicode)) {
2125 PyErr_BadInternalCall();
2126 return NULL;
2127 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002128 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002129 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002130
Victor Stinner87af4f22011-11-21 23:03:47 +01002131 length = PyUnicode_GET_LENGTH(unicode);
2132 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002133 if (!copy)
2134 return NULL;
2135 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2136
Victor Stinner87af4f22011-11-21 23:03:47 +01002137 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2138 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002139 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002140 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002141}
2142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002143
Victor Stinnerbc603d12011-10-02 01:00:40 +02002144/* Widen Unicode objects to larger buffers. Don't write terminating null
2145 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146
2147void*
2148_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2149{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002150 Py_ssize_t len;
2151 void *result;
2152 unsigned int skind;
2153
Benjamin Petersonbac79492012-01-14 13:34:47 -05002154 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002155 return NULL;
2156
2157 len = PyUnicode_GET_LENGTH(s);
2158 skind = PyUnicode_KIND(s);
2159 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002160 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002161 return NULL;
2162 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002163 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002164 case PyUnicode_2BYTE_KIND:
2165 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2166 if (!result)
2167 return PyErr_NoMemory();
2168 assert(skind == PyUnicode_1BYTE_KIND);
2169 _PyUnicode_CONVERT_BYTES(
2170 Py_UCS1, Py_UCS2,
2171 PyUnicode_1BYTE_DATA(s),
2172 PyUnicode_1BYTE_DATA(s) + len,
2173 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002175 case PyUnicode_4BYTE_KIND:
2176 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2177 if (!result)
2178 return PyErr_NoMemory();
2179 if (skind == PyUnicode_2BYTE_KIND) {
2180 _PyUnicode_CONVERT_BYTES(
2181 Py_UCS2, Py_UCS4,
2182 PyUnicode_2BYTE_DATA(s),
2183 PyUnicode_2BYTE_DATA(s) + len,
2184 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002186 else {
2187 assert(skind == PyUnicode_1BYTE_KIND);
2188 _PyUnicode_CONVERT_BYTES(
2189 Py_UCS1, Py_UCS4,
2190 PyUnicode_1BYTE_DATA(s),
2191 PyUnicode_1BYTE_DATA(s) + len,
2192 result);
2193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002195 default:
2196 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 }
Victor Stinner01698042011-10-04 00:04:26 +02002198 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 return NULL;
2200}
2201
2202static Py_UCS4*
2203as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2204 int copy_null)
2205{
2206 int kind;
2207 void *data;
2208 Py_ssize_t len, targetlen;
2209 if (PyUnicode_READY(string) == -1)
2210 return NULL;
2211 kind = PyUnicode_KIND(string);
2212 data = PyUnicode_DATA(string);
2213 len = PyUnicode_GET_LENGTH(string);
2214 targetlen = len;
2215 if (copy_null)
2216 targetlen++;
2217 if (!target) {
2218 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2219 PyErr_NoMemory();
2220 return NULL;
2221 }
2222 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2223 if (!target) {
2224 PyErr_NoMemory();
2225 return NULL;
2226 }
2227 }
2228 else {
2229 if (targetsize < targetlen) {
2230 PyErr_Format(PyExc_SystemError,
2231 "string is longer than the buffer");
2232 if (copy_null && 0 < targetsize)
2233 target[0] = 0;
2234 return NULL;
2235 }
2236 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002237 if (kind == PyUnicode_1BYTE_KIND) {
2238 Py_UCS1 *start = (Py_UCS1 *) data;
2239 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002241 else if (kind == PyUnicode_2BYTE_KIND) {
2242 Py_UCS2 *start = (Py_UCS2 *) data;
2243 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2244 }
2245 else {
2246 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 if (copy_null)
2250 target[len] = 0;
2251 return target;
2252}
2253
2254Py_UCS4*
2255PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2256 int copy_null)
2257{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002258 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 PyErr_BadInternalCall();
2260 return NULL;
2261 }
2262 return as_ucs4(string, target, targetsize, copy_null);
2263}
2264
2265Py_UCS4*
2266PyUnicode_AsUCS4Copy(PyObject *string)
2267{
2268 return as_ucs4(string, NULL, 0, 1);
2269}
2270
2271#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002272
Alexander Belopolsky40018472011-02-26 01:02:56 +00002273PyObject *
2274PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002275{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002277 if (size == 0) {
2278 Py_INCREF(unicode_empty);
2279 return unicode_empty;
2280 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002281 PyErr_BadInternalCall();
2282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283 }
2284
Martin v. Löwis790465f2008-04-05 20:41:37 +00002285 if (size == -1) {
2286 size = wcslen(w);
2287 }
2288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290}
2291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002292#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002293
Walter Dörwald346737f2007-05-31 10:44:43 +00002294static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002295makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002296 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002297{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002298 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002299 if (longflag)
2300 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002301 else if (longlongflag) {
2302 /* longlongflag should only ever be nonzero on machines with
2303 HAVE_LONG_LONG defined */
2304#ifdef HAVE_LONG_LONG
2305 char *f = PY_FORMAT_LONG_LONG;
2306 while (*f)
2307 *fmt++ = *f++;
2308#else
2309 /* we shouldn't ever get here */
2310 assert(0);
2311 *fmt++ = 'l';
2312#endif
2313 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002314 else if (size_tflag) {
2315 char *f = PY_FORMAT_SIZE_T;
2316 while (*f)
2317 *fmt++ = *f++;
2318 }
2319 *fmt++ = c;
2320 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002321}
2322
Victor Stinner15a11362012-10-06 23:48:20 +02002323/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002324 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2325 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2326#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002327
2328static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002329unicode_fromformat_arg(_PyUnicodeWriter *writer,
2330 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002331{
Victor Stinnere215d962012-10-06 23:03:36 +02002332 const char *p;
2333 Py_ssize_t len;
2334 int zeropad;
2335 int width;
2336 int precision;
2337 int longflag;
2338 int longlongflag;
2339 int size_tflag;
2340 int fill;
2341
2342 p = f;
2343 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002344 zeropad = 0;
2345 if (*f == '0') {
2346 zeropad = 1;
2347 f++;
2348 }
Victor Stinner96865452011-03-01 23:44:09 +00002349
2350 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002351 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002352 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002353 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2354 PyErr_SetString(PyExc_ValueError,
2355 "width too big");
2356 return NULL;
2357 }
Victor Stinnere215d962012-10-06 23:03:36 +02002358 width = (width*10) + (*f - '0');
2359 f++;
2360 }
Victor Stinner96865452011-03-01 23:44:09 +00002361 precision = 0;
2362 if (*f == '.') {
2363 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002364 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002365 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2366 PyErr_SetString(PyExc_ValueError,
2367 "precision too big");
2368 return NULL;
2369 }
Victor Stinnere215d962012-10-06 23:03:36 +02002370 precision = (precision*10) + (*f - '0');
2371 f++;
2372 }
Victor Stinner96865452011-03-01 23:44:09 +00002373 if (*f == '%') {
2374 /* "%.3%s" => f points to "3" */
2375 f--;
2376 }
2377 }
2378 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002379 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002380 f--;
2381 }
Victor Stinner96865452011-03-01 23:44:09 +00002382
2383 /* Handle %ld, %lu, %lld and %llu. */
2384 longflag = 0;
2385 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002386 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002387 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002388 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002389 longflag = 1;
2390 ++f;
2391 }
2392#ifdef HAVE_LONG_LONG
2393 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002394 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002395 longlongflag = 1;
2396 f += 2;
2397 }
2398#endif
2399 }
2400 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002401 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002402 size_tflag = 1;
2403 ++f;
2404 }
Victor Stinnere215d962012-10-06 23:03:36 +02002405
2406 if (f[1] == '\0')
2407 writer->overallocate = 0;
2408
2409 switch (*f) {
2410 case 'c':
2411 {
2412 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002413 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2414 PyErr_SetString(PyExc_ValueError,
2415 "character argument not in range(0x110000)");
2416 return NULL;
2417 }
Victor Stinnere215d962012-10-06 23:03:36 +02002418 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2419 return NULL;
2420 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2421 writer->pos++;
2422 break;
2423 }
2424
2425 case 'i':
2426 case 'd':
2427 case 'u':
2428 case 'x':
2429 {
2430 /* used by sprintf */
2431 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002432 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002433
2434 if (*f == 'u') {
2435 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2436
2437 if (longflag)
2438 len = sprintf(buffer, fmt,
2439 va_arg(*vargs, unsigned long));
2440#ifdef HAVE_LONG_LONG
2441 else if (longlongflag)
2442 len = sprintf(buffer, fmt,
2443 va_arg(*vargs, unsigned PY_LONG_LONG));
2444#endif
2445 else if (size_tflag)
2446 len = sprintf(buffer, fmt,
2447 va_arg(*vargs, size_t));
2448 else
2449 len = sprintf(buffer, fmt,
2450 va_arg(*vargs, unsigned int));
2451 }
2452 else if (*f == 'x') {
2453 makefmt(fmt, 0, 0, 0, 'x');
2454 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2455 }
2456 else {
2457 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2458
2459 if (longflag)
2460 len = sprintf(buffer, fmt,
2461 va_arg(*vargs, long));
2462#ifdef HAVE_LONG_LONG
2463 else if (longlongflag)
2464 len = sprintf(buffer, fmt,
2465 va_arg(*vargs, PY_LONG_LONG));
2466#endif
2467 else if (size_tflag)
2468 len = sprintf(buffer, fmt,
2469 va_arg(*vargs, Py_ssize_t));
2470 else
2471 len = sprintf(buffer, fmt,
2472 va_arg(*vargs, int));
2473 }
2474 assert(len >= 0);
2475
Victor Stinnere215d962012-10-06 23:03:36 +02002476 if (precision < len)
2477 precision = len;
2478 if (width > precision) {
2479 Py_UCS4 fillchar;
2480 fill = width - precision;
2481 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002482 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2483 return NULL;
2484 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2485 return NULL;
2486 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002487 }
Victor Stinner15a11362012-10-06 23:48:20 +02002488 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002489 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002490 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2491 return NULL;
2492 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2493 return NULL;
2494 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002495 }
Victor Stinner15a11362012-10-06 23:48:20 +02002496 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002497 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002498 break;
2499 }
2500
2501 case 'p':
2502 {
2503 char number[MAX_LONG_LONG_CHARS];
2504
2505 len = sprintf(number, "%p", va_arg(*vargs, void*));
2506 assert(len >= 0);
2507
2508 /* %p is ill-defined: ensure leading 0x. */
2509 if (number[1] == 'X')
2510 number[1] = 'x';
2511 else if (number[1] != 'x') {
2512 memmove(number + 2, number,
2513 strlen(number) + 1);
2514 number[0] = '0';
2515 number[1] = 'x';
2516 len += 2;
2517 }
2518
2519 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2520 return NULL;
2521 break;
2522 }
2523
2524 case 's':
2525 {
2526 /* UTF-8 */
2527 const char *s = va_arg(*vargs, const char*);
2528 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2529 if (!str)
2530 return NULL;
2531 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2532 Py_DECREF(str);
2533 return NULL;
2534 }
2535 Py_DECREF(str);
2536 break;
2537 }
2538
2539 case 'U':
2540 {
2541 PyObject *obj = va_arg(*vargs, PyObject *);
2542 assert(obj && _PyUnicode_CHECK(obj));
2543
2544 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2545 return NULL;
2546 break;
2547 }
2548
2549 case 'V':
2550 {
2551 PyObject *obj = va_arg(*vargs, PyObject *);
2552 const char *str = va_arg(*vargs, const char *);
2553 PyObject *str_obj;
2554 assert(obj || str);
2555 if (obj) {
2556 assert(_PyUnicode_CHECK(obj));
2557 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2558 return NULL;
2559 }
2560 else {
2561 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2562 if (!str_obj)
2563 return NULL;
2564 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2565 Py_DECREF(str_obj);
2566 return NULL;
2567 }
2568 Py_DECREF(str_obj);
2569 }
2570 break;
2571 }
2572
2573 case 'S':
2574 {
2575 PyObject *obj = va_arg(*vargs, PyObject *);
2576 PyObject *str;
2577 assert(obj);
2578 str = PyObject_Str(obj);
2579 if (!str)
2580 return NULL;
2581 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2582 Py_DECREF(str);
2583 return NULL;
2584 }
2585 Py_DECREF(str);
2586 break;
2587 }
2588
2589 case 'R':
2590 {
2591 PyObject *obj = va_arg(*vargs, PyObject *);
2592 PyObject *repr;
2593 assert(obj);
2594 repr = PyObject_Repr(obj);
2595 if (!repr)
2596 return NULL;
2597 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2598 Py_DECREF(repr);
2599 return NULL;
2600 }
2601 Py_DECREF(repr);
2602 break;
2603 }
2604
2605 case 'A':
2606 {
2607 PyObject *obj = va_arg(*vargs, PyObject *);
2608 PyObject *ascii;
2609 assert(obj);
2610 ascii = PyObject_ASCII(obj);
2611 if (!ascii)
2612 return NULL;
2613 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2614 Py_DECREF(ascii);
2615 return NULL;
2616 }
2617 Py_DECREF(ascii);
2618 break;
2619 }
2620
2621 case '%':
2622 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2623 return NULL;
2624 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2625 writer->pos++;
2626 break;
2627
2628 default:
2629 /* if we stumble upon an unknown formatting code, copy the rest
2630 of the format string to the output string. (we cannot just
2631 skip the code, since there's no way to know what's in the
2632 argument list) */
2633 len = strlen(p);
2634 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2635 return NULL;
2636 f = p+len;
2637 return f;
2638 }
2639
2640 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002641 return f;
2642}
2643
Walter Dörwaldd2034312007-05-18 16:29:38 +00002644PyObject *
2645PyUnicode_FromFormatV(const char *format, va_list vargs)
2646{
Victor Stinnere215d962012-10-06 23:03:36 +02002647 va_list vargs2;
2648 const char *f;
2649 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002650
Victor Stinnere215d962012-10-06 23:03:36 +02002651 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2652
2653 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2654 Copy it to be able to pass a reference to a subfunction. */
2655 Py_VA_COPY(vargs2, vargs);
2656
2657 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002658 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002659 f = unicode_fromformat_arg(&writer, f, &vargs2);
2660 if (f == NULL)
2661 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002664 const char *p;
2665 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002666
Victor Stinnere215d962012-10-06 23:03:36 +02002667 p = f;
2668 do
2669 {
2670 if ((unsigned char)*p > 127) {
2671 PyErr_Format(PyExc_ValueError,
2672 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2673 "string, got a non-ASCII byte: 0x%02x",
2674 (unsigned char)*p);
2675 return NULL;
2676 }
2677 p++;
2678 }
2679 while (*p != '\0' && *p != '%');
2680 len = p - f;
2681
2682 if (*p == '\0')
2683 writer.overallocate = 0;
2684 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2685 goto fail;
2686 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2687 writer.pos += len;
2688
2689 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002690 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002691 }
Victor Stinnere215d962012-10-06 23:03:36 +02002692 return _PyUnicodeWriter_Finish(&writer);
2693
2694 fail:
2695 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002697}
2698
Walter Dörwaldd2034312007-05-18 16:29:38 +00002699PyObject *
2700PyUnicode_FromFormat(const char *format, ...)
2701{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 PyObject* ret;
2703 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002704
2705#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002709#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 ret = PyUnicode_FromFormatV(format, vargs);
2711 va_end(vargs);
2712 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713}
2714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715#ifdef HAVE_WCHAR_H
2716
Victor Stinner5593d8a2010-10-02 11:11:27 +00002717/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2718 convert a Unicode object to a wide character string.
2719
Victor Stinnerd88d9832011-09-06 02:00:05 +02002720 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002721 character) required to convert the unicode object. Ignore size argument.
2722
Victor Stinnerd88d9832011-09-06 02:00:05 +02002723 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002724 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002725 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002726static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002727unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002728 wchar_t *w,
2729 Py_ssize_t size)
2730{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732 const wchar_t *wstr;
2733
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002734 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 if (wstr == NULL)
2736 return -1;
2737
Victor Stinner5593d8a2010-10-02 11:11:27 +00002738 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002739 if (size > res)
2740 size = res + 1;
2741 else
2742 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002744 return res;
2745 }
2746 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002748}
2749
2750Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002751PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002752 wchar_t *w,
2753 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754{
2755 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002756 PyErr_BadInternalCall();
2757 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002759 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760}
2761
Victor Stinner137c34c2010-09-29 10:25:54 +00002762wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002763PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002764 Py_ssize_t *size)
2765{
2766 wchar_t* buffer;
2767 Py_ssize_t buflen;
2768
2769 if (unicode == NULL) {
2770 PyErr_BadInternalCall();
2771 return NULL;
2772 }
2773
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002774 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 if (buflen == -1)
2776 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002777 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002778 PyErr_NoMemory();
2779 return NULL;
2780 }
2781
Victor Stinner137c34c2010-09-29 10:25:54 +00002782 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2783 if (buffer == NULL) {
2784 PyErr_NoMemory();
2785 return NULL;
2786 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002787 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002788 if (buflen == -1) {
2789 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002790 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002791 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002792 if (size != NULL)
2793 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002794 return buffer;
2795}
2796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002797#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798
Alexander Belopolsky40018472011-02-26 01:02:56 +00002799PyObject *
2800PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002803 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002804 PyErr_SetString(PyExc_ValueError,
2805 "chr() arg not in range(0x110000)");
2806 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002807 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002809 if (ordinal < 256)
2810 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002812 v = PyUnicode_New(1, ordinal);
2813 if (v == NULL)
2814 return NULL;
2815 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002816 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002817 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002818}
2819
Alexander Belopolsky40018472011-02-26 01:02:56 +00002820PyObject *
2821PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002823 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002824 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002826 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002827 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002828 Py_INCREF(obj);
2829 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002830 }
2831 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 /* For a Unicode subtype that's not a Unicode object,
2833 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002834 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002835 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002836 PyErr_Format(PyExc_TypeError,
2837 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002838 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002839 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002840}
2841
Alexander Belopolsky40018472011-02-26 01:02:56 +00002842PyObject *
2843PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002844 const char *encoding,
2845 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002846{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002847 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002848 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002849
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002851 PyErr_BadInternalCall();
2852 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002854
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002855 /* Decoding bytes objects is the most common case and should be fast */
2856 if (PyBytes_Check(obj)) {
2857 if (PyBytes_GET_SIZE(obj) == 0) {
2858 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002859 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002860 }
2861 else {
2862 v = PyUnicode_Decode(
2863 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2864 encoding, errors);
2865 }
2866 return v;
2867 }
2868
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002869 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002870 PyErr_SetString(PyExc_TypeError,
2871 "decoding str is not supported");
2872 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002873 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002874
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002875 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2876 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2877 PyErr_Format(PyExc_TypeError,
2878 "coercing to str: need bytes, bytearray "
2879 "or buffer-like object, %.80s found",
2880 Py_TYPE(obj)->tp_name);
2881 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002882 }
Tim Petersced69f82003-09-16 20:30:58 +00002883
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002884 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002886 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 }
Tim Petersced69f82003-09-16 20:30:58 +00002888 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002889 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002890
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002891 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002892 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893}
2894
/* Normalize an encoding name for the fast-path comparisons done by the
   callers: upper-case letters (as classified by Py_ISUPPER) are lowered and
   '_' is replaced with '-', so that e.g. "UTF_8" compares equal to "utf-8".
   A NULL encoding is normalized to "utf-8".

   The result is written to lower, a buffer of lower_len bytes, and is
   NUL-terminated on success.

   Return 1 on success, 0 if the normalized name plus its terminating NUL
   does not fit into lower_len bytes (the content of lower is then
   unspecified). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminator; bail out before
       computing &lower[lower_len - 1], whose index would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof() counts the trailing NUL, so this is the exact space
           strcpy() needs. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last slot of the buffer: reserved for the terminating NUL */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2931
Alexander Belopolsky40018472011-02-26 01:02:56 +00002932PyObject *
2933PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002934 Py_ssize_t size,
2935 const char *encoding,
2936 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002937{
2938 PyObject *buffer = NULL, *unicode;
2939 Py_buffer info;
2940 char lower[11]; /* Enough for any encoding shortcut */
2941
Fred Drakee4315f52000-05-09 19:53:39 +00002942 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002943 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002944 if ((strcmp(lower, "utf-8") == 0) ||
2945 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002946 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002947 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002948 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002949 (strcmp(lower, "iso-8859-1") == 0))
2950 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002951#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002952 else if (strcmp(lower, "mbcs") == 0)
2953 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002954#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002955 else if (strcmp(lower, "ascii") == 0)
2956 return PyUnicode_DecodeASCII(s, size, errors);
2957 else if (strcmp(lower, "utf-16") == 0)
2958 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2959 else if (strcmp(lower, "utf-32") == 0)
2960 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002962
2963 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002964 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002965 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002966 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002967 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968 if (buffer == NULL)
2969 goto onError;
2970 unicode = PyCodec_Decode(buffer, encoding, errors);
2971 if (unicode == NULL)
2972 goto onError;
2973 if (!PyUnicode_Check(unicode)) {
2974 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002975 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002976 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 Py_DECREF(unicode);
2978 goto onError;
2979 }
2980 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002981 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002982
Benjamin Peterson29060642009-01-31 22:14:21 +00002983 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 Py_XDECREF(buffer);
2985 return NULL;
2986}
2987
Alexander Belopolsky40018472011-02-26 01:02:56 +00002988PyObject *
2989PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002990 const char *encoding,
2991 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002992{
2993 PyObject *v;
2994
2995 if (!PyUnicode_Check(unicode)) {
2996 PyErr_BadArgument();
2997 goto onError;
2998 }
2999
3000 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003001 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003002
3003 /* Decode via the codec registry */
3004 v = PyCodec_Decode(unicode, encoding, errors);
3005 if (v == NULL)
3006 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003007 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003008
Benjamin Peterson29060642009-01-31 22:14:21 +00003009 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003010 return NULL;
3011}
3012
Alexander Belopolsky40018472011-02-26 01:02:56 +00003013PyObject *
3014PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003015 const char *encoding,
3016 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003017{
3018 PyObject *v;
3019
3020 if (!PyUnicode_Check(unicode)) {
3021 PyErr_BadArgument();
3022 goto onError;
3023 }
3024
3025 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003027
3028 /* Decode via the codec registry */
3029 v = PyCodec_Decode(unicode, encoding, errors);
3030 if (v == NULL)
3031 goto onError;
3032 if (!PyUnicode_Check(v)) {
3033 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003034 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003035 Py_TYPE(v)->tp_name);
3036 Py_DECREF(v);
3037 goto onError;
3038 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003039 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003040
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003042 return NULL;
3043}
3044
Alexander Belopolsky40018472011-02-26 01:02:56 +00003045PyObject *
3046PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003047 Py_ssize_t size,
3048 const char *encoding,
3049 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050{
3051 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003052
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 unicode = PyUnicode_FromUnicode(s, size);
3054 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003055 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3057 Py_DECREF(unicode);
3058 return v;
3059}
3060
Alexander Belopolsky40018472011-02-26 01:02:56 +00003061PyObject *
3062PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003063 const char *encoding,
3064 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003065{
3066 PyObject *v;
3067
3068 if (!PyUnicode_Check(unicode)) {
3069 PyErr_BadArgument();
3070 goto onError;
3071 }
3072
3073 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003074 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003075
3076 /* Encode via the codec registry */
3077 v = PyCodec_Encode(unicode, encoding, errors);
3078 if (v == NULL)
3079 goto onError;
3080 return v;
3081
Benjamin Peterson29060642009-01-31 22:14:21 +00003082 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003083 return NULL;
3084}
3085
/* Locate the first wide character of the NUL-terminated string wstr that
   wcstombs() refuses to convert, by converting the string one character at
   a time (one surrogate pair at a time when wchar_t is 16 bits wide).

   Return the index (in wchar_t units) of the offending character, or 0 if
   no character fails to convert. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    /* Room for a surrogate pair plus the terminating NUL */
    wchar_t unit[3] = {0, 0, 0};
#else
    /* Room for one character plus the terminating NUL */
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;

    for (cursor = wstr; *cursor != L'\0'; ) {
        /* Remember where this character starts: it is the value to report */
        const wchar_t *current = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return current - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3132
Victor Stinner1b579672011-12-17 05:47:23 +01003133static int
3134locale_error_handler(const char *errors, int *surrogateescape)
3135{
3136 if (errors == NULL) {
3137 *surrogateescape = 0;
3138 return 0;
3139 }
3140
3141 if (strcmp(errors, "strict") == 0) {
3142 *surrogateescape = 0;
3143 return 0;
3144 }
3145 if (strcmp(errors, "surrogateescape") == 0) {
3146 *surrogateescape = 1;
3147 return 0;
3148 }
3149 PyErr_Format(PyExc_ValueError,
3150 "only 'strict' and 'surrogateescape' error handlers "
3151 "are supported, not '%s'",
3152 errors);
3153 return -1;
3154}
3155
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003156PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003157PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003158{
3159 Py_ssize_t wlen, wlen2;
3160 wchar_t *wstr;
3161 PyObject *bytes = NULL;
3162 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003163 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003164 PyObject *exc;
3165 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003166 int surrogateescape;
3167
3168 if (locale_error_handler(errors, &surrogateescape) < 0)
3169 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003170
3171 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3172 if (wstr == NULL)
3173 return NULL;
3174
3175 wlen2 = wcslen(wstr);
3176 if (wlen2 != wlen) {
3177 PyMem_Free(wstr);
3178 PyErr_SetString(PyExc_TypeError, "embedded null character");
3179 return NULL;
3180 }
3181
3182 if (surrogateescape) {
3183 /* locale encoding with surrogateescape */
3184 char *str;
3185
3186 str = _Py_wchar2char(wstr, &error_pos);
3187 if (str == NULL) {
3188 if (error_pos == (size_t)-1) {
3189 PyErr_NoMemory();
3190 PyMem_Free(wstr);
3191 return NULL;
3192 }
3193 else {
3194 goto encode_error;
3195 }
3196 }
3197 PyMem_Free(wstr);
3198
3199 bytes = PyBytes_FromString(str);
3200 PyMem_Free(str);
3201 }
3202 else {
3203 size_t len, len2;
3204
3205 len = wcstombs(NULL, wstr, 0);
3206 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003207 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003208 goto encode_error;
3209 }
3210
3211 bytes = PyBytes_FromStringAndSize(NULL, len);
3212 if (bytes == NULL) {
3213 PyMem_Free(wstr);
3214 return NULL;
3215 }
3216
3217 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3218 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003219 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003220 goto encode_error;
3221 }
3222 PyMem_Free(wstr);
3223 }
3224 return bytes;
3225
3226encode_error:
3227 errmsg = strerror(errno);
3228 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003229
3230 if (error_pos == (size_t)-1)
3231 error_pos = wcstombs_errorpos(wstr);
3232
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003233 PyMem_Free(wstr);
3234 Py_XDECREF(bytes);
3235
Victor Stinner2f197072011-12-17 07:08:30 +01003236 if (errmsg != NULL) {
3237 size_t errlen;
3238 wstr = _Py_char2wchar(errmsg, &errlen);
3239 if (wstr != NULL) {
3240 reason = PyUnicode_FromWideChar(wstr, errlen);
3241 PyMem_Free(wstr);
3242 } else
3243 errmsg = NULL;
3244 }
3245 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003246 reason = PyUnicode_FromString(
3247 "wcstombs() encountered an unencodable "
3248 "wide character");
3249 if (reason == NULL)
3250 return NULL;
3251
3252 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3253 "locale", unicode,
3254 (Py_ssize_t)error_pos,
3255 (Py_ssize_t)(error_pos+1),
3256 reason);
3257 Py_DECREF(reason);
3258 if (exc != NULL) {
3259 PyCodec_StrictErrors(exc);
3260 Py_XDECREF(exc);
3261 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003262 return NULL;
3263}
3264
Victor Stinnerad158722010-10-27 00:25:46 +00003265PyObject *
3266PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003267{
Victor Stinner99b95382011-07-04 14:23:54 +02003268#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003269 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003270#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003271 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003272#else
Victor Stinner793b5312011-04-27 00:24:21 +02003273 PyInterpreterState *interp = PyThreadState_GET()->interp;
3274 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3275 cannot use it to encode and decode filenames before it is loaded. Load
3276 the Python codec requires to encode at least its own filename. Use the C
3277 version of the locale codec until the codec registry is initialized and
3278 the Python codec is loaded.
3279
3280 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3281 cannot only rely on it: check also interp->fscodec_initialized for
3282 subinterpreters. */
3283 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003284 return PyUnicode_AsEncodedString(unicode,
3285 Py_FileSystemDefaultEncoding,
3286 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003287 }
3288 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003289 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003290 }
Victor Stinnerad158722010-10-27 00:25:46 +00003291#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003292}
3293
Alexander Belopolsky40018472011-02-26 01:02:56 +00003294PyObject *
3295PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003296 const char *encoding,
3297 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298{
3299 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003300 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003301
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 if (!PyUnicode_Check(unicode)) {
3303 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 }
Fred Drakee4315f52000-05-09 19:53:39 +00003306
Fred Drakee4315f52000-05-09 19:53:39 +00003307 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003308 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003309 if ((strcmp(lower, "utf-8") == 0) ||
3310 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003311 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003312 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003313 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003314 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003315 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003316 }
Victor Stinner37296e82010-06-10 13:36:23 +00003317 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003318 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003319 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003320 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003321#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003322 else if (strcmp(lower, "mbcs") == 0)
3323 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003324#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003325 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003326 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003327 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328
3329 /* Encode via the codec registry */
3330 v = PyCodec_Encode(unicode, encoding, errors);
3331 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003332 return NULL;
3333
3334 /* The normal path */
3335 if (PyBytes_Check(v))
3336 return v;
3337
3338 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003339 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003340 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003341 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003342
3343 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3344 "encoder %s returned bytearray instead of bytes",
3345 encoding);
3346 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003347 Py_DECREF(v);
3348 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003349 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003350
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003351 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3352 Py_DECREF(v);
3353 return b;
3354 }
3355
3356 PyErr_Format(PyExc_TypeError,
3357 "encoder did not return a bytes object (type=%.400s)",
3358 Py_TYPE(v)->tp_name);
3359 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003360 return NULL;
3361}
3362
Alexander Belopolsky40018472011-02-26 01:02:56 +00003363PyObject *
3364PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003365 const char *encoding,
3366 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003367{
3368 PyObject *v;
3369
3370 if (!PyUnicode_Check(unicode)) {
3371 PyErr_BadArgument();
3372 goto onError;
3373 }
3374
3375 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003376 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003377
3378 /* Encode via the codec registry */
3379 v = PyCodec_Encode(unicode, encoding, errors);
3380 if (v == NULL)
3381 goto onError;
3382 if (!PyUnicode_Check(v)) {
3383 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003384 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003385 Py_TYPE(v)->tp_name);
3386 Py_DECREF(v);
3387 goto onError;
3388 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003390
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392 return NULL;
3393}
3394
/* Locate the first byte sequence in str (len bytes) that cannot be decoded
   in the current locale, by stepping through the input with mbrtowc().

   Return the byte offset of the invalid or incomplete sequence.  Return 0
   if no such sequence is found, or if mbrtowc() is not available
   (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No way to find the position without mbrtowc() */
    (void)str;
    (void)len;
    return 0;
#endif
}
3425
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003426PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003427PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003428 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003429{
3430 wchar_t smallbuf[256];
3431 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3432 wchar_t *wstr;
3433 size_t wlen, wlen2;
3434 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003435 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003436 size_t error_pos;
3437 char *errmsg;
3438 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003439
3440 if (locale_error_handler(errors, &surrogateescape) < 0)
3441 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003442
3443 if (str[len] != '\0' || len != strlen(str)) {
3444 PyErr_SetString(PyExc_TypeError, "embedded null character");
3445 return NULL;
3446 }
3447
3448 if (surrogateescape)
3449 {
3450 wstr = _Py_char2wchar(str, &wlen);
3451 if (wstr == NULL) {
3452 if (wlen == (size_t)-1)
3453 PyErr_NoMemory();
3454 else
3455 PyErr_SetFromErrno(PyExc_OSError);
3456 return NULL;
3457 }
3458
3459 unicode = PyUnicode_FromWideChar(wstr, wlen);
3460 PyMem_Free(wstr);
3461 }
3462 else {
3463#ifndef HAVE_BROKEN_MBSTOWCS
3464 wlen = mbstowcs(NULL, str, 0);
3465#else
3466 wlen = len;
3467#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003468 if (wlen == (size_t)-1)
3469 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003470 if (wlen+1 <= smallbuf_len) {
3471 wstr = smallbuf;
3472 }
3473 else {
3474 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3475 return PyErr_NoMemory();
3476
3477 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3478 if (!wstr)
3479 return PyErr_NoMemory();
3480 }
3481
3482 /* This shouldn't fail now */
3483 wlen2 = mbstowcs(wstr, str, wlen+1);
3484 if (wlen2 == (size_t)-1) {
3485 if (wstr != smallbuf)
3486 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003487 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003488 }
3489#ifdef HAVE_BROKEN_MBSTOWCS
3490 assert(wlen2 == wlen);
3491#endif
3492 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3493 if (wstr != smallbuf)
3494 PyMem_Free(wstr);
3495 }
3496 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003497
3498decode_error:
3499 errmsg = strerror(errno);
3500 assert(errmsg != NULL);
3501
3502 error_pos = mbstowcs_errorpos(str, len);
3503 if (errmsg != NULL) {
3504 size_t errlen;
3505 wstr = _Py_char2wchar(errmsg, &errlen);
3506 if (wstr != NULL) {
3507 reason = PyUnicode_FromWideChar(wstr, errlen);
3508 PyMem_Free(wstr);
3509 } else
3510 errmsg = NULL;
3511 }
3512 if (errmsg == NULL)
3513 reason = PyUnicode_FromString(
3514 "mbstowcs() encountered an invalid multibyte sequence");
3515 if (reason == NULL)
3516 return NULL;
3517
3518 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3519 "locale", str, len,
3520 (Py_ssize_t)error_pos,
3521 (Py_ssize_t)(error_pos+1),
3522 reason);
3523 Py_DECREF(reason);
3524 if (exc != NULL) {
3525 PyCodec_StrictErrors(exc);
3526 Py_XDECREF(exc);
3527 }
3528 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003529}
3530
3531PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003532PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003533{
3534 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003535 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003536}
3537
3538
3539PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003540PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003541 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003542 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3543}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003544
Christian Heimes5894ba72007-11-04 11:43:14 +00003545PyObject*
3546PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3547{
Victor Stinner99b95382011-07-04 14:23:54 +02003548#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003549 return PyUnicode_DecodeMBCS(s, size, NULL);
3550#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003551 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003552#else
Victor Stinner793b5312011-04-27 00:24:21 +02003553 PyInterpreterState *interp = PyThreadState_GET()->interp;
3554 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3555 cannot use it to encode and decode filenames before it is loaded. Load
3556 the Python codec requires to encode at least its own filename. Use the C
3557 version of the locale codec until the codec registry is initialized and
3558 the Python codec is loaded.
3559
3560 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3561 cannot only rely on it: check also interp->fscodec_initialized for
3562 subinterpreters. */
3563 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003564 return PyUnicode_Decode(s, size,
3565 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003566 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003567 }
3568 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003569 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003570 }
Victor Stinnerad158722010-10-27 00:25:46 +00003571#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003572}
3573
Martin v. Löwis011e8422009-05-05 04:43:17 +00003574
3575int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003576_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003577{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003578 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003579
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003580 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003581 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003582 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3583 PyUnicode_GET_LENGTH(str), '\0', 1);
3584 if (pos == -1)
3585 return 0;
3586 else
3587 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003588}
3589
Antoine Pitrou13348842012-01-29 18:36:34 +01003590int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003591PyUnicode_FSConverter(PyObject* arg, void* addr)
3592{
3593 PyObject *output = NULL;
3594 Py_ssize_t size;
3595 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003596 if (arg == NULL) {
3597 Py_DECREF(*(PyObject**)addr);
3598 return 1;
3599 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003600 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003601 output = arg;
3602 Py_INCREF(output);
3603 }
3604 else {
3605 arg = PyUnicode_FromObject(arg);
3606 if (!arg)
3607 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003608 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003609 Py_DECREF(arg);
3610 if (!output)
3611 return 0;
3612 if (!PyBytes_Check(output)) {
3613 Py_DECREF(output);
3614 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3615 return 0;
3616 }
3617 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003618 size = PyBytes_GET_SIZE(output);
3619 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003620 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003621 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003622 Py_DECREF(output);
3623 return 0;
3624 }
3625 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003626 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003627}
3628
3629
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003630int
3631PyUnicode_FSDecoder(PyObject* arg, void* addr)
3632{
3633 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003634 if (arg == NULL) {
3635 Py_DECREF(*(PyObject**)addr);
3636 return 1;
3637 }
3638 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003639 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003640 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003641 output = arg;
3642 Py_INCREF(output);
3643 }
3644 else {
3645 arg = PyBytes_FromObject(arg);
3646 if (!arg)
3647 return 0;
3648 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3649 PyBytes_GET_SIZE(arg));
3650 Py_DECREF(arg);
3651 if (!output)
3652 return 0;
3653 if (!PyUnicode_Check(output)) {
3654 Py_DECREF(output);
3655 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3656 return 0;
3657 }
3658 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003659 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003660 Py_DECREF(output);
3661 return 0;
3662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003663 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003664 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003665 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3666 Py_DECREF(output);
3667 return 0;
3668 }
3669 *(PyObject**)addr = output;
3670 return Py_CLEANUP_SUPPORTED;
3671}
3672
3673
Martin v. Löwis5b222132007-06-10 09:51:05 +00003674char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003675PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003676{
Christian Heimesf3863112007-11-22 07:46:41 +00003677 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003678
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003679 if (!PyUnicode_Check(unicode)) {
3680 PyErr_BadArgument();
3681 return NULL;
3682 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003683 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003684 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003685
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003686 if (PyUnicode_UTF8(unicode) == NULL) {
3687 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003688 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3689 if (bytes == NULL)
3690 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003691 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3692 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003693 Py_DECREF(bytes);
3694 return NULL;
3695 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003696 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3697 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3698 PyBytes_AS_STRING(bytes),
3699 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003700 Py_DECREF(bytes);
3701 }
3702
3703 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003704 *psize = PyUnicode_UTF8_LENGTH(unicode);
3705 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003706}
3707
3708char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003710{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3712}
3713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003714Py_UNICODE *
3715PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3716{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 const unsigned char *one_byte;
3718#if SIZEOF_WCHAR_T == 4
3719 const Py_UCS2 *two_bytes;
3720#else
3721 const Py_UCS4 *four_bytes;
3722 const Py_UCS4 *ucs4_end;
3723 Py_ssize_t num_surrogates;
3724#endif
3725 wchar_t *w;
3726 wchar_t *wchar_end;
3727
3728 if (!PyUnicode_Check(unicode)) {
3729 PyErr_BadArgument();
3730 return NULL;
3731 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003732 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003733 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003734 assert(_PyUnicode_KIND(unicode) != 0);
3735 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003736
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003737 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003738#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003739 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3740 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003741 num_surrogates = 0;
3742
3743 for (; four_bytes < ucs4_end; ++four_bytes) {
3744 if (*four_bytes > 0xFFFF)
3745 ++num_surrogates;
3746 }
3747
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003748 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3749 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3750 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003751 PyErr_NoMemory();
3752 return NULL;
3753 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003754 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003756 w = _PyUnicode_WSTR(unicode);
3757 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3758 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003759 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3760 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003761 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003763 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3764 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 }
3766 else
3767 *w = *four_bytes;
3768
3769 if (w > wchar_end) {
3770 assert(0 && "Miscalculated string end");
3771 }
3772 }
3773 *w = 0;
3774#else
3775 /* sizeof(wchar_t) == 4 */
3776 Py_FatalError("Impossible unicode object state, wstr and str "
3777 "should share memory already.");
3778 return NULL;
3779#endif
3780 }
3781 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003782 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3783 (_PyUnicode_LENGTH(unicode) + 1));
3784 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785 PyErr_NoMemory();
3786 return NULL;
3787 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003788 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3789 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3790 w = _PyUnicode_WSTR(unicode);
3791 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003792
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003793 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3794 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003795 for (; w < wchar_end; ++one_byte, ++w)
3796 *w = *one_byte;
3797 /* null-terminate the wstr */
3798 *w = 0;
3799 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003800 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003801#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003802 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803 for (; w < wchar_end; ++two_bytes, ++w)
3804 *w = *two_bytes;
3805 /* null-terminate the wstr */
3806 *w = 0;
3807#else
3808 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003809 PyObject_FREE(_PyUnicode_WSTR(unicode));
3810 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 Py_FatalError("Impossible unicode object state, wstr "
3812 "and str should share memory already.");
3813 return NULL;
3814#endif
3815 }
3816 else {
3817 assert(0 && "This should never happen.");
3818 }
3819 }
3820 }
3821 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003822 *size = PyUnicode_WSTR_LENGTH(unicode);
3823 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003824}
3825
Alexander Belopolsky40018472011-02-26 01:02:56 +00003826Py_UNICODE *
3827PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830}
3831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832
Alexander Belopolsky40018472011-02-26 01:02:56 +00003833Py_ssize_t
3834PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835{
3836 if (!PyUnicode_Check(unicode)) {
3837 PyErr_BadArgument();
3838 goto onError;
3839 }
3840 return PyUnicode_GET_SIZE(unicode);
3841
Benjamin Peterson29060642009-01-31 22:14:21 +00003842 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 return -1;
3844}
3845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003846Py_ssize_t
3847PyUnicode_GetLength(PyObject *unicode)
3848{
Victor Stinner07621332012-06-16 04:53:46 +02003849 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003850 PyErr_BadArgument();
3851 return -1;
3852 }
Victor Stinner07621332012-06-16 04:53:46 +02003853 if (PyUnicode_READY(unicode) == -1)
3854 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003855 return PyUnicode_GET_LENGTH(unicode);
3856}
3857
3858Py_UCS4
3859PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3860{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003861 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3862 PyErr_BadArgument();
3863 return (Py_UCS4)-1;
3864 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003865 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003866 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867 return (Py_UCS4)-1;
3868 }
3869 return PyUnicode_READ_CHAR(unicode, index);
3870}
3871
3872int
3873PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3874{
3875 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003876 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 return -1;
3878 }
Victor Stinner488fa492011-12-12 00:01:39 +01003879 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003880 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003881 PyErr_SetString(PyExc_IndexError, "string index out of range");
3882 return -1;
3883 }
Victor Stinner488fa492011-12-12 00:01:39 +01003884 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003885 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003886 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3887 PyErr_SetString(PyExc_ValueError, "character out of range");
3888 return -1;
3889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3891 index, ch);
3892 return 0;
3893}
3894
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3900
Victor Stinner554f3f02010-06-16 23:33:54 +00003901/* create or adjust a UnicodeDecodeError */
3902static void
3903make_decode_exception(PyObject **exceptionObject,
3904 const char *encoding,
3905 const char *input, Py_ssize_t length,
3906 Py_ssize_t startpos, Py_ssize_t endpos,
3907 const char *reason)
3908{
3909 if (*exceptionObject == NULL) {
3910 *exceptionObject = PyUnicodeDecodeError_Create(
3911 encoding, input, length, startpos, endpos, reason);
3912 }
3913 else {
3914 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3915 goto onError;
3916 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3917 goto onError;
3918 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3919 goto onError;
3920 }
3921 return;
3922
3923onError:
3924 Py_DECREF(*exceptionObject);
3925 *exceptionObject = NULL;
3926}
3927
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928/* error handling callback helper:
3929 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003930 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 and adjust various state variables.
3932 return 0 on success, -1 on error
3933*/
3934
Alexander Belopolsky40018472011-02-26 01:02:56 +00003935static int
3936unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003937 const char *encoding, const char *reason,
3938 const char **input, const char **inend, Py_ssize_t *startinpos,
3939 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003940 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003941{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003942 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003943
3944 PyObject *restuple = NULL;
3945 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003946 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003947 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003948 Py_ssize_t requiredsize;
3949 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003950 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 int res = -1;
3952
Victor Stinner596a6c42011-11-09 00:02:18 +01003953 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3954 outsize = PyUnicode_GET_LENGTH(*output);
3955 else
3956 outsize = _PyUnicode_WSTR_LENGTH(*output);
3957
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003959 *errorHandler = PyCodec_LookupError(errors);
3960 if (*errorHandler == NULL)
3961 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 }
3963
Victor Stinner554f3f02010-06-16 23:33:54 +00003964 make_decode_exception(exceptionObject,
3965 encoding,
3966 *input, *inend - *input,
3967 *startinpos, *endinpos,
3968 reason);
3969 if (*exceptionObject == NULL)
3970 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971
3972 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3973 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003974 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003975 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003976 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003977 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003978 }
3979 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05003981 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003982 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003983
3984 /* Copy back the bytes variables, which might have been modified by the
3985 callback */
3986 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3987 if (!inputobj)
3988 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003989 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003991 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003992 *input = PyBytes_AS_STRING(inputobj);
3993 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003994 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003995 /* we can DECREF safely, as the exception has another reference,
3996 so the object won't go away. */
3997 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003998
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003999 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004000 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004001 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4003 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004004 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005
Victor Stinner596a6c42011-11-09 00:02:18 +01004006 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4007 /* need more space? (at least enough for what we
4008 have+the replacement+the rest of the string (starting
4009 at the new input position), so we won't have to check space
4010 when there are no errors in the rest of the string) */
4011 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4012 requiredsize = *outpos + replen + insize-newpos;
4013 if (requiredsize > outsize) {
4014 if (requiredsize<2*outsize)
4015 requiredsize = 2*outsize;
4016 if (unicode_resize(output, requiredsize) < 0)
4017 goto onError;
4018 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004019 if (unicode_widen(output, *outpos,
4020 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004021 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004022 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004023 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004025 else {
4026 wchar_t *repwstr;
4027 Py_ssize_t repwlen;
4028 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4029 if (repwstr == NULL)
4030 goto onError;
4031 /* need more space? (at least enough for what we
4032 have+the replacement+the rest of the string (starting
4033 at the new input position), so we won't have to check space
4034 when there are no errors in the rest of the string) */
4035 requiredsize = *outpos + repwlen + insize-newpos;
4036 if (requiredsize > outsize) {
4037 if (requiredsize < 2*outsize)
4038 requiredsize = 2*outsize;
4039 if (unicode_resize(output, requiredsize) < 0)
4040 goto onError;
4041 }
4042 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4043 *outpos += repwlen;
4044 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004046 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004047
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004048 /* we made it! */
4049 res = 0;
4050
Benjamin Peterson29060642009-01-31 22:14:21 +00004051 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004052 Py_XDECREF(restuple);
4053 return res;
4054}
4055
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Each of them may evaluate its
   argument more than once, so pass expressions without side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * NUL and everything at or above 128 is never encoded directly.  The
 * directO and directWS arguments are parenthesized so that any
 * expression can be passed for them. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004136
Alexander Belopolsky40018472011-02-26 01:02:56 +00004137PyObject *
4138PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004139 Py_ssize_t size,
4140 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004141{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004142 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4143}
4144
Antoine Pitrou244651a2009-05-04 18:56:13 +00004145/* The decoder. The only state we preserve is our read position,
4146 * i.e. how many characters we have consumed. So if we end in the
4147 * middle of a shift sequence we have to back off the read position
4148 * and the output to the beginning of the sequence, otherwise we lose
4149 * all the shift state (seen bits, number of bits seen, high
4150 * surrogate). */
4151
Alexander Belopolsky40018472011-02-26 01:02:56 +00004152PyObject *
4153PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004154 Py_ssize_t size,
4155 const char *errors,
4156 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004157{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004159 Py_ssize_t startinpos;
4160 Py_ssize_t endinpos;
4161 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004162 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004163 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004164 const char *errmsg = "";
4165 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004166 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004167 unsigned int base64bits = 0;
4168 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004169 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170 PyObject *errorHandler = NULL;
4171 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004172
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004173 /* Start off assuming it's all ASCII. Widen later as necessary. */
4174 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004175 if (!unicode)
4176 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004177 if (size == 0) {
4178 if (consumed)
4179 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004180 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004181 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004182
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004183 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004184 e = s + size;
4185
4186 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004187 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004188 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004189 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004190
Antoine Pitrou244651a2009-05-04 18:56:13 +00004191 if (inShift) { /* in a base-64 section */
4192 if (IS_BASE64(ch)) { /* consume a base-64 character */
4193 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4194 base64bits += 6;
4195 s++;
4196 if (base64bits >= 16) {
4197 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004198 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004199 base64bits -= 16;
4200 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4201 if (surrogate) {
4202 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004203 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4204 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004205 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4206 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004207 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004208 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004209 }
4210 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004211 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4212 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004213 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004214 }
4215 }
Victor Stinner551ac952011-11-29 22:58:13 +01004216 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004217 /* first surrogate */
4218 surrogate = outCh;
4219 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004220 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004221 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4222 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004223 }
4224 }
4225 }
4226 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004227 inShift = 0;
4228 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004229 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004230 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4231 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004232 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004233 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004234 if (base64bits > 0) { /* left-over bits */
4235 if (base64bits >= 6) {
4236 /* We've seen at least one base-64 character */
4237 errmsg = "partial character in shift sequence";
4238 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004239 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004240 else {
4241 /* Some bits remain; they should be zero */
4242 if (base64buffer != 0) {
4243 errmsg = "non-zero padding bits in shift sequence";
4244 goto utf7Error;
4245 }
4246 }
4247 }
4248 if (ch != '-') {
4249 /* '-' is absorbed; other terminating
4250 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004251 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4252 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004253 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004254 }
4255 }
4256 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004258 s++; /* consume '+' */
4259 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004260 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004261 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4262 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004263 }
4264 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004265 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004266 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004267 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004268 }
4269 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004270 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004271 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4272 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004273 s++;
4274 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004275 else {
4276 startinpos = s-starts;
4277 s++;
4278 errmsg = "unexpected special character";
4279 goto utf7Error;
4280 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004281 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004282utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 endinpos = s-starts;
4284 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004285 errors, &errorHandler,
4286 "utf7", errmsg,
4287 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004288 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004290 }
4291
Antoine Pitrou244651a2009-05-04 18:56:13 +00004292 /* end of string */
4293
4294 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4295 /* if we're in an inconsistent state, that's an error */
4296 if (surrogate ||
4297 (base64bits >= 6) ||
4298 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004299 endinpos = size;
4300 if (unicode_decode_call_errorhandler(
4301 errors, &errorHandler,
4302 "utf7", "unterminated shift sequence",
4303 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004304 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004305 goto onError;
4306 if (s < e)
4307 goto restart;
4308 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004309 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004310
4311 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004312 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004313 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004314 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004315 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004316 }
4317 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004318 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004319 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004320 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004321
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004322 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323 goto onError;
4324
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004325 Py_XDECREF(errorHandler);
4326 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004327 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004328
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004330 Py_XDECREF(errorHandler);
4331 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004332 Py_DECREF(unicode);
4333 return NULL;
4334}
4335
4336
Alexander Belopolsky40018472011-02-26 01:02:56 +00004337PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004338_PyUnicode_EncodeUTF7(PyObject *str,
4339 int base64SetO,
4340 int base64WhiteSpace,
4341 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004342{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004343 int kind;
4344 void *data;
4345 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004346 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004347 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004348 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004349 unsigned int base64bits = 0;
4350 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351 char * out;
4352 char * start;
4353
Benjamin Petersonbac79492012-01-14 13:34:47 -05004354 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004355 return NULL;
4356 kind = PyUnicode_KIND(str);
4357 data = PyUnicode_DATA(str);
4358 len = PyUnicode_GET_LENGTH(str);
4359
4360 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004361 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004362
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004363 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004364 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004365 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004366 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004367 if (v == NULL)
4368 return NULL;
4369
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004370 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004371 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004372 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004373
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 if (inShift) {
4375 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4376 /* shifting out */
4377 if (base64bits) { /* output remaining bits */
4378 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4379 base64buffer = 0;
4380 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004381 }
4382 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383 /* Characters not in the BASE64 set implicitly unshift the sequence
4384 so no '-' is required, except if the character is itself a '-' */
4385 if (IS_BASE64(ch) || ch == '-') {
4386 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004387 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388 *out++ = (char) ch;
4389 }
4390 else {
4391 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004392 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004393 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004394 else { /* not in a shift sequence */
4395 if (ch == '+') {
4396 *out++ = '+';
4397 *out++ = '-';
4398 }
4399 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4400 *out++ = (char) ch;
4401 }
4402 else {
4403 *out++ = '+';
4404 inShift = 1;
4405 goto encode_char;
4406 }
4407 }
4408 continue;
4409encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004410 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004411 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004412
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 /* code first surrogate */
4414 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004415 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 while (base64bits >= 6) {
4417 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4418 base64bits -= 6;
4419 }
4420 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004421 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004422 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 base64bits += 16;
4424 base64buffer = (base64buffer << 16) | ch;
4425 while (base64bits >= 6) {
4426 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4427 base64bits -= 6;
4428 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004429 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430 if (base64bits)
4431 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4432 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004433 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004434 if (_PyBytes_Resize(&v, out - start) < 0)
4435 return NULL;
4436 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004438PyObject *
4439PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4440 Py_ssize_t size,
4441 int base64SetO,
4442 int base64WhiteSpace,
4443 const char *errors)
4444{
4445 PyObject *result;
4446 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4447 if (tmp == NULL)
4448 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004449 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004450 base64WhiteSpace, errors);
4451 Py_DECREF(tmp);
4452 return result;
4453}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004454
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455#undef IS_BASE64
4456#undef FROM_BASE64
4457#undef TO_BASE64
4458#undef DECODE_DIRECT
4459#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004460
/* --- UTF-8 Codec -------------------------------------------------------- */
4462
Alexander Belopolsky40018472011-02-26 01:02:56 +00004463PyObject *
4464PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004465 Py_ssize_t size,
4466 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467{
Walter Dörwald69652032004-09-07 20:24:22 +00004468 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4469}
4470
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004471#include "stringlib/asciilib.h"
4472#include "stringlib/codecs.h"
4473#include "stringlib/undef.h"
4474
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004475#include "stringlib/ucs1lib.h"
4476#include "stringlib/codecs.h"
4477#include "stringlib/undef.h"
4478
4479#include "stringlib/ucs2lib.h"
4480#include "stringlib/codecs.h"
4481#include "stringlib/undef.h"
4482
4483#include "stringlib/ucs4lib.h"
4484#include "stringlib/codecs.h"
4485#include "stringlib/undef.h"
4486
Antoine Pitrouab868312009-01-10 15:40:25 +00004487/* Mask to quickly check whether a C 'long' contains a
4488 non-ASCII, UTF8-encoded char. */
4489#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004490# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004491#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004492# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004493#else
4494# error C 'long' size should be either 4 or 8!
4495#endif
4496
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004497static Py_ssize_t
4498ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004499{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004500 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004501 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004502
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004503#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004504 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4505 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004506 /* Fast path, see in STRINGLIB(utf8_decode) for
4507 an explanation. */
4508 /* Help register allocation */
4509 register const char *_p = p;
4510 register Py_UCS1 * q = dest;
4511 while (_p < aligned_end) {
4512 unsigned long value = *(const unsigned long *) _p;
4513 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004515 *((unsigned long *)q) = value;
4516 _p += SIZEOF_LONG;
4517 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004518 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004519 p = _p;
4520 while (p < end) {
4521 if ((unsigned char)*p & 0x80)
4522 break;
4523 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004525 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004527#endif
4528 while (p < end) {
4529 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4530 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004531 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004532 /* Help register allocation */
4533 register const char *_p = p;
4534 while (_p < aligned_end) {
4535 unsigned long value = *(unsigned long *) _p;
4536 if (value & ASCII_CHAR_MASK)
4537 break;
4538 _p += SIZEOF_LONG;
4539 }
4540 p = _p;
4541 if (_p == end)
4542 break;
4543 }
4544 if ((unsigned char)*p & 0x80)
4545 break;
4546 ++p;
4547 }
4548 memcpy(dest, start, p - start);
4549 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550}
Antoine Pitrouab868312009-01-10 15:40:25 +00004551
Victor Stinner785938e2011-12-11 20:09:03 +01004552PyObject *
4553PyUnicode_DecodeUTF8Stateful(const char *s,
4554 Py_ssize_t size,
4555 const char *errors,
4556 Py_ssize_t *consumed)
4557{
Victor Stinner785938e2011-12-11 20:09:03 +01004558 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004559 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004560 const char *end = s + size;
4561 Py_ssize_t outpos;
4562
4563 Py_ssize_t startinpos;
4564 Py_ssize_t endinpos;
4565 const char *errmsg = "";
4566 PyObject *errorHandler = NULL;
4567 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004568
4569 if (size == 0) {
4570 if (consumed)
4571 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004572 Py_INCREF(unicode_empty);
4573 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004574 }
4575
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004576 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4577 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004578 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004579 *consumed = 1;
4580 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004581 }
4582
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004583 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004584 if (!unicode)
4585 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004586
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004587 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4588 s += outpos;
4589 while (s < end) {
4590 Py_UCS4 ch;
4591 int kind = PyUnicode_KIND(unicode);
4592 if (kind == PyUnicode_1BYTE_KIND) {
4593 if (PyUnicode_IS_ASCII(unicode))
4594 ch = asciilib_utf8_decode(&s, end,
4595 PyUnicode_1BYTE_DATA(unicode), &outpos);
4596 else
4597 ch = ucs1lib_utf8_decode(&s, end,
4598 PyUnicode_1BYTE_DATA(unicode), &outpos);
4599 } else if (kind == PyUnicode_2BYTE_KIND) {
4600 ch = ucs2lib_utf8_decode(&s, end,
4601 PyUnicode_2BYTE_DATA(unicode), &outpos);
4602 } else {
4603 assert(kind == PyUnicode_4BYTE_KIND);
4604 ch = ucs4lib_utf8_decode(&s, end,
4605 PyUnicode_4BYTE_DATA(unicode), &outpos);
4606 }
4607
4608 switch (ch) {
4609 case 0:
4610 if (s == end || consumed)
4611 goto End;
4612 errmsg = "unexpected end of data";
4613 startinpos = s - starts;
4614 endinpos = startinpos + 1;
4615 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4616 endinpos++;
4617 break;
4618 case 1:
4619 errmsg = "invalid start byte";
4620 startinpos = s - starts;
4621 endinpos = startinpos + 1;
4622 break;
4623 case 2:
4624 errmsg = "invalid continuation byte";
4625 startinpos = s - starts;
4626 endinpos = startinpos + 1;
4627 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4628 endinpos++;
4629 break;
4630 default:
4631 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4632 goto onError;
4633 continue;
4634 }
4635
4636 if (unicode_decode_call_errorhandler(
4637 errors, &errorHandler,
4638 "utf-8", errmsg,
4639 &starts, &end, &startinpos, &endinpos, &exc, &s,
4640 &unicode, &outpos))
4641 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004642 }
4643
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004644End:
4645 if (unicode_resize(&unicode, outpos) < 0)
4646 goto onError;
4647
4648 if (consumed)
4649 *consumed = s - starts;
4650
4651 Py_XDECREF(errorHandler);
4652 Py_XDECREF(exc);
4653 assert(_PyUnicode_CheckConsistency(unicode, 1));
4654 return unicode;
4655
4656onError:
4657 Py_XDECREF(errorHandler);
4658 Py_XDECREF(exc);
4659 Py_XDECREF(unicode);
4660 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004661}
4662
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes starting at `s` into a freshly PyMem_Malloc()ed,
   NUL-terminated wchar_t array.  Each undecodable byte b is stored as
   0xDC00 + b.  Returns NULL if the size computation would overflow (after
   calling PyErr_NoMemory()) or if the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
            /* The decoder handed back a character it could not store in
               the output buffer itself. */
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* With a 2-byte wchar_t this must be a non-BMP code point,
               which is split into a surrogate pair below.  (It is not
               itself a surrogate, which is what the previous assertion
               wrongly required.) */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /*  compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* Small return values are decoder status codes; 0 at the end
               of the input means everything was decoded. */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004718/* Primary internal function which creates utf8 encoded bytes objects.
4719
4720 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004721 and allocate exactly as much space needed at the end. Else allocate the
4722 maximum possible needed (4 result bytes per Unicode character), and return
4723 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004724*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004725PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004726_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727{
Victor Stinner6099a032011-12-18 14:22:26 +01004728 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004729 void *data;
4730 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004732 if (!PyUnicode_Check(unicode)) {
4733 PyErr_BadArgument();
4734 return NULL;
4735 }
4736
4737 if (PyUnicode_READY(unicode) == -1)
4738 return NULL;
4739
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004740 if (PyUnicode_UTF8(unicode))
4741 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4742 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004743
4744 kind = PyUnicode_KIND(unicode);
4745 data = PyUnicode_DATA(unicode);
4746 size = PyUnicode_GET_LENGTH(unicode);
4747
Benjamin Petersonead6b532011-12-20 17:23:42 -06004748 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004749 default:
4750 assert(0);
4751 case PyUnicode_1BYTE_KIND:
4752 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4753 assert(!PyUnicode_IS_ASCII(unicode));
4754 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4755 case PyUnicode_2BYTE_KIND:
4756 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4757 case PyUnicode_4BYTE_KIND:
4758 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760}
4761
Alexander Belopolsky40018472011-02-26 01:02:56 +00004762PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004763PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4764 Py_ssize_t size,
4765 const char *errors)
4766{
4767 PyObject *v, *unicode;
4768
4769 unicode = PyUnicode_FromUnicode(s, size);
4770 if (unicode == NULL)
4771 return NULL;
4772 v = _PyUnicode_AsUTF8String(unicode, errors);
4773 Py_DECREF(unicode);
4774 return v;
4775}
4776
4777PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004778PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004780 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781}
4782
/* --- UTF-32 Codec ------------------------------------------------------- */
4784
4785PyObject *
4786PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 Py_ssize_t size,
4788 const char *errors,
4789 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004790{
4791 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4792}
4793
4794PyObject *
4795PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004796 Py_ssize_t size,
4797 const char *errors,
4798 int *byteorder,
4799 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004800{
4801 const char *starts = s;
4802 Py_ssize_t startinpos;
4803 Py_ssize_t endinpos;
4804 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004805 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004806 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004807 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004808 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004809 PyObject *errorHandler = NULL;
4810 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004811
Walter Dörwald41980ca2007-08-16 21:55:45 +00004812 q = (unsigned char *)s;
4813 e = q + size;
4814
4815 if (byteorder)
4816 bo = *byteorder;
4817
4818 /* Check for BOM marks (U+FEFF) in the input and adjust current
4819 byte order setting accordingly. In native mode, the leading BOM
4820 mark is skipped, in all other modes, it is copied to the output
4821 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004822 if (bo == 0 && size >= 4) {
4823 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4824 if (bom == 0x0000FEFF) {
4825 bo = -1;
4826 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004827 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004828 else if (bom == 0xFFFE0000) {
4829 bo = 1;
4830 q += 4;
4831 }
4832 if (byteorder)
4833 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004834 }
4835
Victor Stinnere64322e2012-10-30 23:12:47 +01004836 if (q == e) {
4837 if (consumed)
4838 *consumed = size;
4839 Py_INCREF(unicode_empty);
4840 return unicode_empty;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004841 }
4842
Victor Stinnere64322e2012-10-30 23:12:47 +01004843#ifdef WORDS_BIGENDIAN
4844 le = bo < 0;
4845#else
4846 le = bo <= 0;
4847#endif
4848
4849 unicode = PyUnicode_New((e - q + 3) / 4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004850 if (!unicode)
4851 return NULL;
Victor Stinnere64322e2012-10-30 23:12:47 +01004852
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004853 outpos = 0;
Victor Stinnere64322e2012-10-30 23:12:47 +01004854 while (1) {
4855 Py_UCS4 ch = 0;
4856 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004857
Victor Stinnere64322e2012-10-30 23:12:47 +01004858 if (e - q >= 4) {
4859 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
4860 void *data = PyUnicode_DATA(unicode);
4861 const unsigned char *last = e - 4;
4862 if (le) {
4863 do {
4864 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4865 if (ch > maxch)
4866 break;
4867 PyUnicode_WRITE(kind, data, outpos++, ch);
4868 q += 4;
4869 } while (q <= last);
4870 }
4871 else {
4872 do {
4873 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4874 if (ch > maxch)
4875 break;
4876 PyUnicode_WRITE(kind, data, outpos++, ch);
4877 q += 4;
4878 } while (q <= last);
4879 }
4880 }
4881
4882 if (ch <= maxch) {
4883 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004884 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01004885 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00004886 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01004887 startinpos = ((const char *)q) - starts;
4888 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00004889 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004890 else {
4891 if (ch < 0x110000) {
4892 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4893 goto onError;
4894 q += 4;
4895 continue;
4896 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01004898 startinpos = ((const char *)q) - starts;
4899 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004901
4902 /* The remaining input chars are ignored if the callback
4903 chooses to skip the input */
Benjamin Peterson29060642009-01-31 22:14:21 +00004904 if (unicode_decode_call_errorhandler(
4905 errors, &errorHandler,
4906 "utf32", errmsg,
4907 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004908 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004909 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004910 }
4911
Walter Dörwald41980ca2007-08-16 21:55:45 +00004912 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004914
4915 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01004916 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004917 goto onError;
4918
4919 Py_XDECREF(errorHandler);
4920 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004921 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004922
Benjamin Peterson29060642009-01-31 22:14:21 +00004923 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004924 Py_DECREF(unicode);
4925 Py_XDECREF(errorHandler);
4926 Py_XDECREF(exc);
4927 return NULL;
4928}
4929
4930PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004931_PyUnicode_EncodeUTF32(PyObject *str,
4932 const char *errors,
4933 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004934{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004935 int kind;
4936 void *data;
4937 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004938 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004939 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004940 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004941 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02004942#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00004943 int iorder[] = {0, 1, 2, 3};
4944#else
4945 int iorder[] = {3, 2, 1, 0};
4946#endif
4947
Benjamin Peterson29060642009-01-31 22:14:21 +00004948#define STORECHAR(CH) \
4949 do { \
4950 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4951 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4952 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4953 p[iorder[0]] = (CH) & 0xff; \
4954 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955 } while(0)
4956
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004957 if (!PyUnicode_Check(str)) {
4958 PyErr_BadArgument();
4959 return NULL;
4960 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004961 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004962 return NULL;
4963 kind = PyUnicode_KIND(str);
4964 data = PyUnicode_DATA(str);
4965 len = PyUnicode_GET_LENGTH(str);
4966
4967 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004968 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004970 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004971 if (v == NULL)
4972 return NULL;
4973
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004974 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004975 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004977 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004978 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004979
4980 if (byteorder == -1) {
4981 /* force LE */
4982 iorder[0] = 0;
4983 iorder[1] = 1;
4984 iorder[2] = 2;
4985 iorder[3] = 3;
4986 }
4987 else if (byteorder == 1) {
4988 /* force BE */
4989 iorder[0] = 3;
4990 iorder[1] = 2;
4991 iorder[2] = 1;
4992 iorder[3] = 0;
4993 }
4994
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004995 for (i = 0; i < len; i++)
4996 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00004997
4998 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004999 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005000#undef STORECHAR
5001}
5002
Alexander Belopolsky40018472011-02-26 01:02:56 +00005003PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005004PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5005 Py_ssize_t size,
5006 const char *errors,
5007 int byteorder)
5008{
5009 PyObject *result;
5010 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5011 if (tmp == NULL)
5012 return NULL;
5013 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5014 Py_DECREF(tmp);
5015 return result;
5016}
5017
5018PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005019PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005020{
Victor Stinnerb960b342011-11-20 19:12:52 +01005021 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005022}
5023
/* --- UTF-16 Codec ------------------------------------------------------- */
5025
Tim Peters772747b2001-08-09 22:21:55 +00005026PyObject *
5027PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 Py_ssize_t size,
5029 const char *errors,
5030 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031{
Walter Dörwald69652032004-09-07 20:24:22 +00005032 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5033}
5034
5035PyObject *
5036PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 Py_ssize_t size,
5038 const char *errors,
5039 int *byteorder,
5040 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005041{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005042 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005043 Py_ssize_t startinpos;
5044 Py_ssize_t endinpos;
5045 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005046 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005047 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005048 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005049 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005050 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005051 PyObject *errorHandler = NULL;
5052 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053
Tim Peters772747b2001-08-09 22:21:55 +00005054 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005055 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056
5057 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005058 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005060 /* Check for BOM marks (U+FEFF) in the input and adjust current
5061 byte order setting accordingly. In native mode, the leading BOM
5062 mark is skipped, in all other modes, it is copied to the output
5063 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005064 if (bo == 0 && size >= 2) {
5065 const Py_UCS4 bom = (q[1] << 8) | q[0];
5066 if (bom == 0xFEFF) {
5067 q += 2;
5068 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005070 else if (bom == 0xFFFE) {
5071 q += 2;
5072 bo = 1;
5073 }
5074 if (byteorder)
5075 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005076 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077
Antoine Pitrou63065d72012-05-15 23:48:04 +02005078 if (q == e) {
5079 if (consumed)
5080 *consumed = size;
5081 Py_INCREF(unicode_empty);
5082 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005083 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005084
Christian Heimes743e0cd2012-10-17 23:52:17 +02005085#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005086 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005087#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005088 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005089#endif
Tim Peters772747b2001-08-09 22:21:55 +00005090
Antoine Pitrou63065d72012-05-15 23:48:04 +02005091 /* Note: size will always be longer than the resulting Unicode
5092 character count */
5093 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5094 if (!unicode)
5095 return NULL;
5096
5097 outpos = 0;
5098 while (1) {
5099 Py_UCS4 ch = 0;
5100 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005101 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005102 if (kind == PyUnicode_1BYTE_KIND) {
5103 if (PyUnicode_IS_ASCII(unicode))
5104 ch = asciilib_utf16_decode(&q, e,
5105 PyUnicode_1BYTE_DATA(unicode), &outpos,
5106 native_ordering);
5107 else
5108 ch = ucs1lib_utf16_decode(&q, e,
5109 PyUnicode_1BYTE_DATA(unicode), &outpos,
5110 native_ordering);
5111 } else if (kind == PyUnicode_2BYTE_KIND) {
5112 ch = ucs2lib_utf16_decode(&q, e,
5113 PyUnicode_2BYTE_DATA(unicode), &outpos,
5114 native_ordering);
5115 } else {
5116 assert(kind == PyUnicode_4BYTE_KIND);
5117 ch = ucs4lib_utf16_decode(&q, e,
5118 PyUnicode_4BYTE_DATA(unicode), &outpos,
5119 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005120 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005121 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122
Antoine Pitrou63065d72012-05-15 23:48:04 +02005123 switch (ch)
5124 {
5125 case 0:
5126 /* remaining byte at the end? (size should be even) */
5127 if (q == e || consumed)
5128 goto End;
5129 errmsg = "truncated data";
5130 startinpos = ((const char *)q) - starts;
5131 endinpos = ((const char *)e) - starts;
5132 break;
5133 /* The remaining input chars are ignored if the callback
5134 chooses to skip the input */
5135 case 1:
5136 errmsg = "unexpected end of data";
5137 startinpos = ((const char *)q) - 2 - starts;
5138 endinpos = ((const char *)e) - starts;
5139 break;
5140 case 2:
5141 errmsg = "illegal encoding";
5142 startinpos = ((const char *)q) - 2 - starts;
5143 endinpos = startinpos + 2;
5144 break;
5145 case 3:
5146 errmsg = "illegal UTF-16 surrogate";
5147 startinpos = ((const char *)q) - 4 - starts;
5148 endinpos = startinpos + 2;
5149 break;
5150 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005151 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5152 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005153 continue;
5154 }
5155
Benjamin Peterson29060642009-01-31 22:14:21 +00005156 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005157 errors,
5158 &errorHandler,
5159 "utf16", errmsg,
5160 &starts,
5161 (const char **)&e,
5162 &startinpos,
5163 &endinpos,
5164 &exc,
5165 (const char **)&q,
5166 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005167 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169 }
5170
Antoine Pitrou63065d72012-05-15 23:48:04 +02005171End:
Walter Dörwald69652032004-09-07 20:24:22 +00005172 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005174
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005176 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 goto onError;
5178
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179 Py_XDECREF(errorHandler);
5180 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005181 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005185 Py_XDECREF(errorHandler);
5186 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 return NULL;
5188}
5189
Tim Peters772747b2001-08-09 22:21:55 +00005190PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005191_PyUnicode_EncodeUTF16(PyObject *str,
5192 const char *errors,
5193 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005195 enum PyUnicode_Kind kind;
5196 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005197 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005198 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005199 unsigned short *out;
5200 Py_ssize_t bytesize;
5201 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005202#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005203 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005204#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005205 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005206#endif
5207
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005208 if (!PyUnicode_Check(str)) {
5209 PyErr_BadArgument();
5210 return NULL;
5211 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005212 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005213 return NULL;
5214 kind = PyUnicode_KIND(str);
5215 data = PyUnicode_DATA(str);
5216 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005217
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005218 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005219 if (kind == PyUnicode_4BYTE_KIND) {
5220 const Py_UCS4 *in = (const Py_UCS4 *)data;
5221 const Py_UCS4 *end = in + len;
5222 while (in < end)
5223 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005224 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005225 }
5226 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005228 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005229 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 if (v == NULL)
5231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005233 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005234 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005235 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005237 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005238 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005239 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005240
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005241 switch (kind) {
5242 case PyUnicode_1BYTE_KIND: {
5243 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5244 break;
Tim Peters772747b2001-08-09 22:21:55 +00005245 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005246 case PyUnicode_2BYTE_KIND: {
5247 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5248 break;
Tim Peters772747b2001-08-09 22:21:55 +00005249 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005250 case PyUnicode_4BYTE_KIND: {
5251 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5252 break;
5253 }
5254 default:
5255 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005256 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005257
5258 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005259 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260}
5261
Alexander Belopolsky40018472011-02-26 01:02:56 +00005262PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005263PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5264 Py_ssize_t size,
5265 const char *errors,
5266 int byteorder)
5267{
5268 PyObject *result;
5269 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5270 if (tmp == NULL)
5271 return NULL;
5272 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5273 Py_DECREF(tmp);
5274 return result;
5275}
5276
5277PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005278PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005280 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281}
5282
5283/* --- Unicode Escape Codec ----------------------------------------------- */
5284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005285/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5286 if all the escapes in the string make it still a valid ASCII string.
5287 Returns -1 if any escapes were found which cause the string to
5288 pop out of ASCII range. Otherwise returns the length of the
5289 required buffer to hold the string.
5290 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005291static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005292length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5293{
5294 const unsigned char *p = (const unsigned char *)s;
5295 const unsigned char *end = p + size;
5296 Py_ssize_t length = 0;
5297
5298 if (size < 0)
5299 return -1;
5300
5301 for (; p < end; ++p) {
5302 if (*p > 127) {
5303 /* Non-ASCII */
5304 return -1;
5305 }
5306 else if (*p != '\\') {
5307 /* Normal character */
5308 ++length;
5309 }
5310 else {
5311 /* Backslash-escape, check next char */
5312 ++p;
5313 /* Escape sequence reaches till end of string or
5314 non-ASCII follow-up. */
5315 if (p >= end || *p > 127)
5316 return -1;
5317 switch (*p) {
5318 case '\n':
5319 /* backslash + \n result in zero characters */
5320 break;
5321 case '\\': case '\'': case '\"':
5322 case 'b': case 'f': case 't':
5323 case 'n': case 'r': case 'v': case 'a':
5324 ++length;
5325 break;
5326 case '0': case '1': case '2': case '3':
5327 case '4': case '5': case '6': case '7':
5328 case 'x': case 'u': case 'U': case 'N':
5329 /* these do not guarantee ASCII characters */
5330 return -1;
5331 default:
5332 /* count the backslash + the other character */
5333 length += 2;
5334 }
5335 }
5336 }
5337 return length;
5338}
5339
Fredrik Lundh06d12682001-01-24 07:59:11 +00005340static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005341
Alexander Belopolsky40018472011-02-26 01:02:56 +00005342PyObject *
5343PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005344 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005345 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005348 Py_ssize_t startinpos;
5349 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005350 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005351 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005353 char* message;
5354 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355 PyObject *errorHandler = NULL;
5356 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005357 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005358 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005359
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005360 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005361
5362 /* After length_of_escaped_ascii_string() there are two alternatives,
5363 either the string is pure ASCII with named escapes like \n, etc.
5364 and we determined it's exact size (common case)
5365 or it contains \x, \u, ... escape sequences. then we create a
5366 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005367 if (len >= 0) {
5368 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005369 if (!v)
5370 goto onError;
5371 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005372 }
5373 else {
5374 /* Escaped strings will always be longer than the resulting
5375 Unicode string, so we start with size here and then reduce the
5376 length after conversion to the true value.
5377 (but if the error callback returns a long replacement string
5378 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005379 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005380 if (!v)
5381 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005382 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005383 }
5384
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005386 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005387 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005389
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 while (s < end) {
5391 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005392 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005393 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005395 /* The only case in which i == ascii_length is a backslash
5396 followed by a newline. */
5397 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005398
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 /* Non-escape characters are interpreted as Unicode ordinals */
5400 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005401 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5402 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 continue;
5404 }
5405
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005406 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 /* \ - Escapes */
5408 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005409 c = *s++;
5410 if (s > end)
5411 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005412
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005413 /* The only case in which i == ascii_length is a backslash
5414 followed by a newline. */
5415 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005416
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005417 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005420#define WRITECHAR(ch) \
5421 do { \
5422 if (unicode_putchar(&v, &i, ch) < 0) \
5423 goto onError; \
5424 }while(0)
5425
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005427 case '\\': WRITECHAR('\\'); break;
5428 case '\'': WRITECHAR('\''); break;
5429 case '\"': WRITECHAR('\"'); break;
5430 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005431 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005432 case 'f': WRITECHAR('\014'); break;
5433 case 't': WRITECHAR('\t'); break;
5434 case 'n': WRITECHAR('\n'); break;
5435 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005436 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005437 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005438 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005439 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 case '0': case '1': case '2': case '3':
5443 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005444 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005445 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005446 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005447 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005448 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005450 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 break;
5452
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 /* hex escapes */
5454 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005456 digits = 2;
5457 message = "truncated \\xXX escape";
5458 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005462 digits = 4;
5463 message = "truncated \\uXXXX escape";
5464 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465
Benjamin Peterson29060642009-01-31 22:14:21 +00005466 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005467 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005468 digits = 8;
5469 message = "truncated \\UXXXXXXXX escape";
5470 hexescape:
5471 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005472 if (s+digits>end) {
5473 endinpos = size;
5474 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 errors, &errorHandler,
5476 "unicodeescape", "end of string in escape sequence",
5477 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005478 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005479 goto onError;
5480 goto nextByte;
5481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005482 for (j = 0; j < digits; ++j) {
5483 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005484 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005485 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005486 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005487 errors, &errorHandler,
5488 "unicodeescape", message,
5489 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005490 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005491 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005492 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005493 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005494 }
5495 chr = (chr<<4) & ~0xF;
5496 if (c >= '0' && c <= '9')
5497 chr += c - '0';
5498 else if (c >= 'a' && c <= 'f')
5499 chr += 10 + c - 'a';
5500 else
5501 chr += 10 + c - 'A';
5502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005503 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005504 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 /* _decoding_error will have already written into the
5506 target buffer. */
5507 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005508 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005509 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005510 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005511 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005512 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005513 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005514 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 errors, &errorHandler,
5516 "unicodeescape", "illegal Unicode character",
5517 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005518 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005519 goto onError;
5520 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005521 break;
5522
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005524 case 'N':
5525 message = "malformed \\N character escape";
5526 if (ucnhash_CAPI == NULL) {
5527 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005528 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5529 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005530 if (ucnhash_CAPI == NULL)
5531 goto ucnhashError;
5532 }
5533 if (*s == '{') {
5534 const char *start = s+1;
5535 /* look for the closing brace */
5536 while (*s != '}' && s < end)
5537 s++;
5538 if (s > start && s < end && *s == '}') {
5539 /* found a name. look it up in the unicode database */
5540 message = "unknown Unicode character name";
5541 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005542 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005543 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005544 goto store;
5545 }
5546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005547 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005548 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005549 errors, &errorHandler,
5550 "unicodeescape", message,
5551 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005552 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005553 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005554 break;
5555
5556 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005557 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005558 message = "\\ at end of string";
5559 s--;
5560 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005561 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 errors, &errorHandler,
5563 "unicodeescape", message,
5564 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005565 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005566 goto onError;
5567 }
5568 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005569 WRITECHAR('\\');
5570 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005571 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005572 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005575 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005577#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005578
Victor Stinner16e6a802011-12-12 13:24:15 +01005579 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005580 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005581 Py_XDECREF(errorHandler);
5582 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005583 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005584
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005586 PyErr_SetString(
5587 PyExc_UnicodeError,
5588 "\\N escapes not supported (can't load unicodedata module)"
5589 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005590 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 Py_XDECREF(errorHandler);
5592 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005593 return NULL;
5594
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597 Py_XDECREF(errorHandler);
5598 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 return NULL;
5600}
5601
5602/* Return a Unicode-Escape string version of the Unicode object.
5603
5604 If quotes is true, the string is enclosed in u"" or u'' quotes as
5605 appropriate.
5606
5607*/
5608
Alexander Belopolsky40018472011-02-26 01:02:56 +00005609PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005610PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005612 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005613 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005615 int kind;
5616 void *data;
5617 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618
Ezio Melottie7f90372012-10-05 03:33:31 +03005619 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005620 escape.
5621
Ezio Melottie7f90372012-10-05 03:33:31 +03005622 For UCS1 strings it's '\xxx', 4 bytes per source character.
5623 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5624 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005625 */
5626
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005627 if (!PyUnicode_Check(unicode)) {
5628 PyErr_BadArgument();
5629 return NULL;
5630 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005631 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005632 return NULL;
5633 len = PyUnicode_GET_LENGTH(unicode);
5634 kind = PyUnicode_KIND(unicode);
5635 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005636 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005637 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5638 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5639 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5640 }
5641
5642 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005643 return PyBytes_FromStringAndSize(NULL, 0);
5644
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005645 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005647
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005648 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005650 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 if (repr == NULL)
5653 return NULL;
5654
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005655 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005657 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005658 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005659
Walter Dörwald79e913e2007-05-12 11:08:06 +00005660 /* Escape backslashes */
5661 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 *p++ = '\\';
5663 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005664 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005665 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005666
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005667 /* Map 21-bit characters to '\U00xxxxxx' */
5668 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005669 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005670 *p++ = '\\';
5671 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005672 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5673 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5674 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5675 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5676 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5677 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5678 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5679 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005681 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005682
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005684 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 *p++ = '\\';
5686 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005687 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5688 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5689 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5690 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005692
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005693 /* Map special whitespace to '\t', \n', '\r' */
5694 else if (ch == '\t') {
5695 *p++ = '\\';
5696 *p++ = 't';
5697 }
5698 else if (ch == '\n') {
5699 *p++ = '\\';
5700 *p++ = 'n';
5701 }
5702 else if (ch == '\r') {
5703 *p++ = '\\';
5704 *p++ = 'r';
5705 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005706
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005707 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005708 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005710 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005711 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5712 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005713 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005714
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 /* Copy everything else as-is */
5716 else
5717 *p++ = (char) ch;
5718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005720 assert(p - PyBytes_AS_STRING(repr) > 0);
5721 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5722 return NULL;
5723 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724}
5725
Alexander Belopolsky40018472011-02-26 01:02:56 +00005726PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005727PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5728 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005730 PyObject *result;
5731 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5732 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005734 result = PyUnicode_AsUnicodeEscapeString(tmp);
5735 Py_DECREF(tmp);
5736 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737}
5738
5739/* --- Raw Unicode Escape Codec ------------------------------------------- */
5740
Alexander Belopolsky40018472011-02-26 01:02:56 +00005741PyObject *
5742PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005743 Py_ssize_t size,
5744 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005747 Py_ssize_t startinpos;
5748 Py_ssize_t endinpos;
5749 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005750 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 const char *end;
5752 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753 PyObject *errorHandler = NULL;
5754 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005755
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 /* Escaped strings will always be longer than the resulting
5757 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 length after conversion to the true value. (But decoding error
5759 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005760 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005764 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005765 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 end = s + size;
5767 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 unsigned char c;
5769 Py_UCS4 x;
5770 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005771 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 /* Non-escape characters are interpreted as Unicode ordinals */
5774 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005775 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5776 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005778 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 startinpos = s-starts;
5780
5781 /* \u-escapes are only interpreted iff the number of leading
5782 backslashes if odd */
5783 bs = s;
5784 for (;s < end;) {
5785 if (*s != '\\')
5786 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005787 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5788 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 }
5790 if (((s - bs) & 1) == 0 ||
5791 s >= end ||
5792 (*s != 'u' && *s != 'U')) {
5793 continue;
5794 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005795 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 count = *s=='u' ? 4 : 8;
5797 s++;
5798
5799 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005800 for (x = 0, i = 0; i < count; ++i, ++s) {
5801 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005802 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 endinpos = s-starts;
5804 if (unicode_decode_call_errorhandler(
5805 errors, &errorHandler,
5806 "rawunicodeescape", "truncated \\uXXXX",
5807 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005808 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 goto onError;
5810 goto nextByte;
5811 }
5812 x = (x<<4) & ~0xF;
5813 if (c >= '0' && c <= '9')
5814 x += c - '0';
5815 else if (c >= 'a' && c <= 'f')
5816 x += 10 + c - 'a';
5817 else
5818 x += 10 + c - 'A';
5819 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005820 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005821 if (unicode_putchar(&v, &outpos, x) < 0)
5822 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005823 } else {
5824 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005825 if (unicode_decode_call_errorhandler(
5826 errors, &errorHandler,
5827 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005829 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005831 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 nextByte:
5833 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834 }
Victor Stinner16e6a802011-12-12 13:24:15 +01005835 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005837 Py_XDECREF(errorHandler);
5838 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005839 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00005840
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005843 Py_XDECREF(errorHandler);
5844 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 return NULL;
5846}
5847
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005848
Alexander Belopolsky40018472011-02-26 01:02:56 +00005849PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005850PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005852 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 char *p;
5854 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005855 Py_ssize_t expandsize, pos;
5856 int kind;
5857 void *data;
5858 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005860 if (!PyUnicode_Check(unicode)) {
5861 PyErr_BadArgument();
5862 return NULL;
5863 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005864 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005865 return NULL;
5866 kind = PyUnicode_KIND(unicode);
5867 data = PyUnicode_DATA(unicode);
5868 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005869 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5870 bytes, and 1 byte characters 4. */
5871 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005872
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005873 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005875
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005876 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 if (repr == NULL)
5878 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005879 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005880 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005882 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005883 for (pos = 0; pos < len; pos++) {
5884 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 /* Map 32-bit characters to '\Uxxxxxxxx' */
5886 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005887 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005888 *p++ = '\\';
5889 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005890 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5891 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5892 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5893 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5894 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5895 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5896 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5897 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005898 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005900 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 *p++ = '\\';
5902 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005903 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5904 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5905 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5906 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 /* Copy everything else as-is */
5909 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 *p++ = (char) ch;
5911 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005912
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005913 assert(p > q);
5914 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005915 return NULL;
5916 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917}
5918
Alexander Belopolsky40018472011-02-26 01:02:56 +00005919PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005920PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5921 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005923 PyObject *result;
5924 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5925 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005926 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005927 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5928 Py_DECREF(tmp);
5929 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930}
5931
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005932/* --- Unicode Internal Codec ------------------------------------------- */
5933
Alexander Belopolsky40018472011-02-26 01:02:56 +00005934PyObject *
5935_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005936 Py_ssize_t size,
5937 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005938{
5939 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005940 Py_ssize_t startinpos;
5941 Py_ssize_t endinpos;
5942 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005943 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005944 const char *end;
5945 const char *reason;
5946 PyObject *errorHandler = NULL;
5947 PyObject *exc = NULL;
5948
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005949 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005950 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005951 1))
5952 return NULL;
5953
Thomas Wouters89f507f2006-12-13 04:49:30 +00005954 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005955 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005956 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005958 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005959 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005960 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005961 end = s + size;
5962
5963 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005964 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005965 Py_UCS4 ch;
5966 /* We copy the raw representation one byte at a time because the
5967 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005968 ((char *) &uch)[0] = s[0];
5969 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005970#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005971 ((char *) &uch)[2] = s[2];
5972 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005973#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005974 ch = uch;
5975
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005976 /* We have to sanity check the raw data, otherwise doom looms for
5977 some malformed UCS-4 data. */
5978 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005979#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005980 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005981#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005982 end-s < Py_UNICODE_SIZE
5983 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005985 startinpos = s - starts;
5986 if (end-s < Py_UNICODE_SIZE) {
5987 endinpos = end-starts;
5988 reason = "truncated input";
5989 }
5990 else {
5991 endinpos = s - starts + Py_UNICODE_SIZE;
5992 reason = "illegal code point (> 0x10FFFF)";
5993 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005994 if (unicode_decode_call_errorhandler(
5995 errors, &errorHandler,
5996 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005997 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005998 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005999 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006000 continue;
6001 }
6002
6003 s += Py_UNICODE_SIZE;
6004#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006005 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006006 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006007 Py_UNICODE uch2;
6008 ((char *) &uch2)[0] = s[0];
6009 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006010 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006011 {
Victor Stinner551ac952011-11-29 22:58:13 +01006012 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006013 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006014 }
6015 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006016#endif
6017
6018 if (unicode_putchar(&v, &outpos, ch) < 0)
6019 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006020 }
6021
Victor Stinner16e6a802011-12-12 13:24:15 +01006022 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006023 goto onError;
6024 Py_XDECREF(errorHandler);
6025 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006026 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006027
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006029 Py_XDECREF(v);
6030 Py_XDECREF(errorHandler);
6031 Py_XDECREF(exc);
6032 return NULL;
6033}
6034
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035/* --- Latin-1 Codec ------------------------------------------------------ */
6036
Alexander Belopolsky40018472011-02-26 01:02:56 +00006037PyObject *
6038PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006039 Py_ssize_t size,
6040 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006043 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044}
6045
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006046/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006047static void
6048make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006049 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006050 PyObject *unicode,
6051 Py_ssize_t startpos, Py_ssize_t endpos,
6052 const char *reason)
6053{
6054 if (*exceptionObject == NULL) {
6055 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006056 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006057 encoding, unicode, startpos, endpos, reason);
6058 }
6059 else {
6060 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6061 goto onError;
6062 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6063 goto onError;
6064 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6065 goto onError;
6066 return;
6067 onError:
6068 Py_DECREF(*exceptionObject);
6069 *exceptionObject = NULL;
6070 }
6071}
6072
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006073/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006074static void
6075raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006076 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006077 PyObject *unicode,
6078 Py_ssize_t startpos, Py_ssize_t endpos,
6079 const char *reason)
6080{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006081 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006082 encoding, unicode, startpos, endpos, reason);
6083 if (*exceptionObject != NULL)
6084 PyCodec_StrictErrors(*exceptionObject);
6085}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086
6087/* error handling callback helper:
6088 build arguments, call the callback and check the arguments,
6089 put the result into newpos and return the replacement string, which
6090 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006091static PyObject *
6092unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006093 PyObject **errorHandler,
6094 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006095 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006096 Py_ssize_t startpos, Py_ssize_t endpos,
6097 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006098{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006099 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006100 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006101 PyObject *restuple;
6102 PyObject *resunicode;
6103
6104 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108 }
6109
Benjamin Petersonbac79492012-01-14 13:34:47 -05006110 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006111 return NULL;
6112 len = PyUnicode_GET_LENGTH(unicode);
6113
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006114 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006115 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006116 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006118
6119 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006123 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006124 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 Py_DECREF(restuple);
6126 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006127 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006128 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 &resunicode, newpos)) {
6130 Py_DECREF(restuple);
6131 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006132 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006133 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6134 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6135 Py_DECREF(restuple);
6136 return NULL;
6137 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006139 *newpos = len + *newpos;
6140 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6142 Py_DECREF(restuple);
6143 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006144 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145 Py_INCREF(resunicode);
6146 Py_DECREF(restuple);
6147 return resunicode;
6148}
6149
Alexander Belopolsky40018472011-02-26 01:02:56 +00006150static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006151unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006152 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006153 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006155 /* input state */
6156 Py_ssize_t pos=0, size;
6157 int kind;
6158 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006159 /* output object */
6160 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006161 /* pointer into the output */
6162 char *str;
6163 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006164 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006165 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6166 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006167 PyObject *errorHandler = NULL;
6168 PyObject *exc = NULL;
6169 /* the following variable is used for caching string comparisons
6170 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6171 int known_errorHandler = -1;
6172
Benjamin Petersonbac79492012-01-14 13:34:47 -05006173 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006174 return NULL;
6175 size = PyUnicode_GET_LENGTH(unicode);
6176 kind = PyUnicode_KIND(unicode);
6177 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006178 /* allocate enough for a simple encoding without
6179 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006180 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006181 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006182 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006183 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006184 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006185 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006186 ressize = size;
6187
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006188 while (pos < size) {
6189 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006190
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 /* can we encode this? */
6192 if (c<limit) {
6193 /* no overflow check, because we know that the space is enough */
6194 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006195 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006196 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 Py_ssize_t requiredsize;
6199 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006200 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006202 Py_ssize_t collstart = pos;
6203 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006205 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 ++collend;
6207 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6208 if (known_errorHandler==-1) {
6209 if ((errors==NULL) || (!strcmp(errors, "strict")))
6210 known_errorHandler = 1;
6211 else if (!strcmp(errors, "replace"))
6212 known_errorHandler = 2;
6213 else if (!strcmp(errors, "ignore"))
6214 known_errorHandler = 3;
6215 else if (!strcmp(errors, "xmlcharrefreplace"))
6216 known_errorHandler = 4;
6217 else
6218 known_errorHandler = 0;
6219 }
6220 switch (known_errorHandler) {
6221 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006222 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 goto onError;
6224 case 2: /* replace */
6225 while (collstart++<collend)
6226 *str++ = '?'; /* fall through */
6227 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006228 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 break;
6230 case 4: /* xmlcharrefreplace */
6231 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006232 /* determine replacement size */
6233 for (i = collstart, repsize = 0; i < collend; ++i) {
6234 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6235 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006237 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006239 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006241 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006243 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006245 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006247 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006248 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006250 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006252 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 if (requiredsize > ressize) {
6254 if (requiredsize<2*ressize)
6255 requiredsize = 2*ressize;
6256 if (_PyBytes_Resize(&res, requiredsize))
6257 goto onError;
6258 str = PyBytes_AS_STRING(res) + respos;
6259 ressize = requiredsize;
6260 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006261 /* generate replacement */
6262 for (i = collstart; i < collend; ++i) {
6263 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006265 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 break;
6267 default:
6268 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006269 encoding, reason, unicode, &exc,
6270 collstart, collend, &newpos);
6271 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006272 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006274 if (PyBytes_Check(repunicode)) {
6275 /* Directly copy bytes result to output. */
6276 repsize = PyBytes_Size(repunicode);
6277 if (repsize > 1) {
6278 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006279 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006280 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6281 Py_DECREF(repunicode);
6282 goto onError;
6283 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006284 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006285 ressize += repsize-1;
6286 }
6287 memcpy(str, PyBytes_AsString(repunicode), repsize);
6288 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006289 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006290 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006291 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006292 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 /* need more space? (at least enough for what we
6294 have+the replacement+the rest of the string, so
6295 we won't have to check space for encodable characters) */
6296 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006297 repsize = PyUnicode_GET_LENGTH(repunicode);
6298 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 if (requiredsize > ressize) {
6300 if (requiredsize<2*ressize)
6301 requiredsize = 2*ressize;
6302 if (_PyBytes_Resize(&res, requiredsize)) {
6303 Py_DECREF(repunicode);
6304 goto onError;
6305 }
6306 str = PyBytes_AS_STRING(res) + respos;
6307 ressize = requiredsize;
6308 }
6309 /* check if there is anything unencodable in the replacement
6310 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006311 for (i = 0; repsize-->0; ++i, ++str) {
6312 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006314 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006315 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 Py_DECREF(repunicode);
6317 goto onError;
6318 }
6319 *str = (char)c;
6320 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006321 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006322 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006323 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006324 }
6325 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006326 /* Resize if we allocated to much */
6327 size = str - PyBytes_AS_STRING(res);
6328 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006329 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006330 if (_PyBytes_Resize(&res, size) < 0)
6331 goto onError;
6332 }
6333
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006334 Py_XDECREF(errorHandler);
6335 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006336 return res;
6337
6338 onError:
6339 Py_XDECREF(res);
6340 Py_XDECREF(errorHandler);
6341 Py_XDECREF(exc);
6342 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006343}
6344
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006345/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006346PyObject *
6347PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006348 Py_ssize_t size,
6349 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006351 PyObject *result;
6352 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6353 if (unicode == NULL)
6354 return NULL;
6355 result = unicode_encode_ucs1(unicode, errors, 256);
6356 Py_DECREF(unicode);
6357 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358}
6359
Alexander Belopolsky40018472011-02-26 01:02:56 +00006360PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006361_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362{
6363 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 PyErr_BadArgument();
6365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006367 if (PyUnicode_READY(unicode) == -1)
6368 return NULL;
6369 /* Fast path: if it is a one-byte string, construct
6370 bytes object directly. */
6371 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6372 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6373 PyUnicode_GET_LENGTH(unicode));
6374 /* Non-Latin-1 characters present. Defer to above function to
6375 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006376 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006377}
6378
6379PyObject*
6380PyUnicode_AsLatin1String(PyObject *unicode)
6381{
6382 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383}
6384
6385/* --- 7-bit ASCII Codec -------------------------------------------------- */
6386
Alexander Belopolsky40018472011-02-26 01:02:56 +00006387PyObject *
6388PyUnicode_DecodeASCII(const char *s,
6389 Py_ssize_t size,
6390 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006393 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006394 int kind;
6395 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006396 Py_ssize_t startinpos;
6397 Py_ssize_t endinpos;
6398 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 const char *e;
6400 PyObject *errorHandler = NULL;
6401 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006402
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006403 if (size == 0) {
6404 Py_INCREF(unicode_empty);
6405 return unicode_empty;
6406 }
6407
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006409 if (size == 1 && (unsigned char)s[0] < 128)
6410 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006411
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006412 unicode = PyUnicode_New(size, 127);
6413 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006417 data = PyUnicode_1BYTE_DATA(unicode);
6418 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6419 if (outpos == size)
6420 return unicode;
6421
6422 s += outpos;
6423 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 register unsigned char c = (unsigned char)*s;
6426 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006427 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 ++s;
6429 }
6430 else {
6431 startinpos = s-starts;
6432 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 if (unicode_decode_call_errorhandler(
6434 errors, &errorHandler,
6435 "ascii", "ordinal not in range(128)",
6436 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006437 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006439 kind = PyUnicode_KIND(unicode);
6440 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006443 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006444 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006445 Py_XDECREF(errorHandler);
6446 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006447 assert(_PyUnicode_CheckConsistency(unicode, 1));
6448 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006449
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006451 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452 Py_XDECREF(errorHandler);
6453 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 return NULL;
6455}
6456
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006457/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006458PyObject *
6459PyUnicode_EncodeASCII(const Py_UNICODE *p,
6460 Py_ssize_t size,
6461 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006463 PyObject *result;
6464 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6465 if (unicode == NULL)
6466 return NULL;
6467 result = unicode_encode_ucs1(unicode, errors, 128);
6468 Py_DECREF(unicode);
6469 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470}
6471
Alexander Belopolsky40018472011-02-26 01:02:56 +00006472PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006473_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474{
6475 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 PyErr_BadArgument();
6477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006479 if (PyUnicode_READY(unicode) == -1)
6480 return NULL;
6481 /* Fast path: if it is an ASCII-only string, construct bytes object
6482 directly. Else defer to above function to raise the exception. */
6483 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6484 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6485 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006486 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006487}
6488
6489PyObject *
6490PyUnicode_AsASCIIString(PyObject *unicode)
6491{
6492 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493}
6494
Victor Stinner99b95382011-07-04 14:23:54 +02006495#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006496
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006497/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006498
/* The code-page helpers below take their lengths as int.  When int is
   narrower than size_t, NEED_RETRY enables the branches in
   decode_code_page_stateful() and encode_code_page() that cap each chunk
   (at INT_MAX and INT_MAX/2 respectively) and loop until done. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Supply the flag value when the included headers have not defined it. */
#ifndef WC_ERR_INVALID_CHARS
# define WC_ERR_INVALID_CHARS 0x0080
#endif
6506
6507static char*
6508code_page_name(UINT code_page, PyObject **obj)
6509{
6510 *obj = NULL;
6511 if (code_page == CP_ACP)
6512 return "mbcs";
6513 if (code_page == CP_UTF7)
6514 return "CP_UTF7";
6515 if (code_page == CP_UTF8)
6516 return "CP_UTF8";
6517
6518 *obj = PyBytes_FromFormat("cp%u", code_page);
6519 if (*obj == NULL)
6520 return NULL;
6521 return PyBytes_AS_STRING(*obj);
6522}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006523
Alexander Belopolsky40018472011-02-26 01:02:56 +00006524static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006525is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006526{
6527 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006528 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006529
Victor Stinner3a50e702011-10-18 21:21:00 +02006530 if (!IsDBCSLeadByteEx(code_page, *curr))
6531 return 0;
6532
6533 prev = CharPrevExA(code_page, s, curr, 0);
6534 if (prev == curr)
6535 return 1;
6536 /* FIXME: This code is limited to "true" double-byte encodings,
6537 as it assumes an incomplete character consists of a single
6538 byte. */
6539 if (curr - prev == 2)
6540 return 1;
6541 if (!IsDBCSLeadByteEx(code_page, *prev))
6542 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006543 return 0;
6544}
6545
Victor Stinner3a50e702011-10-18 21:21:00 +02006546static DWORD
6547decode_code_page_flags(UINT code_page)
6548{
6549 if (code_page == CP_UTF7) {
6550 /* The CP_UTF7 decoder only supports flags=0 */
6551 return 0;
6552 }
6553 else
6554 return MB_ERR_INVALID_CHARS;
6555}
6556
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006557/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006558 * Decode a byte string from a Windows code page into unicode object in strict
6559 * mode.
6560 *
6561 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6562 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006563 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006564static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006565decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006566 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006567 const char *in,
6568 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006569{
Victor Stinner3a50e702011-10-18 21:21:00 +02006570 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006571 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006572 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006573
6574 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006575 assert(insize > 0);
6576 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6577 if (outsize <= 0)
6578 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006579
6580 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006582 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006583 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006584 if (*v == NULL)
6585 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006586 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006587 }
6588 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006590 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006591 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006593 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006594 }
6595
6596 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006597 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6598 if (outsize <= 0)
6599 goto error;
6600 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006601
Victor Stinner3a50e702011-10-18 21:21:00 +02006602error:
6603 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6604 return -2;
6605 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006606 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006607}
6608
Victor Stinner3a50e702011-10-18 21:21:00 +02006609/*
6610 * Decode a byte string from a code page into unicode object with an error
6611 * handler.
6612 *
6613 * Returns consumed size if succeed, or raise a WindowsError or
6614 * UnicodeDecodeError exception and returns -1 on error.
6615 */
6616static int
6617decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006618 PyObject **v,
6619 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006620 const char *errors)
6621{
6622 const char *startin = in;
6623 const char *endin = in + size;
6624 const DWORD flags = decode_code_page_flags(code_page);
6625 /* Ideally, we should get reason from FormatMessage. This is the Windows
6626 2000 English version of the message. */
6627 const char *reason = "No mapping for the Unicode character exists "
6628 "in the target code page.";
6629 /* each step cannot decode more than 1 character, but a character can be
6630 represented as a surrogate pair */
6631 wchar_t buffer[2], *startout, *out;
6632 int insize, outsize;
6633 PyObject *errorHandler = NULL;
6634 PyObject *exc = NULL;
6635 PyObject *encoding_obj = NULL;
6636 char *encoding;
6637 DWORD err;
6638 int ret = -1;
6639
6640 assert(size > 0);
6641
6642 encoding = code_page_name(code_page, &encoding_obj);
6643 if (encoding == NULL)
6644 return -1;
6645
6646 if (errors == NULL || strcmp(errors, "strict") == 0) {
6647 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6648 UnicodeDecodeError. */
6649 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6650 if (exc != NULL) {
6651 PyCodec_StrictErrors(exc);
6652 Py_CLEAR(exc);
6653 }
6654 goto error;
6655 }
6656
6657 if (*v == NULL) {
6658 /* Create unicode object */
6659 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6660 PyErr_NoMemory();
6661 goto error;
6662 }
Victor Stinnerab595942011-12-17 04:59:06 +01006663 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006664 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006665 if (*v == NULL)
6666 goto error;
6667 startout = PyUnicode_AS_UNICODE(*v);
6668 }
6669 else {
6670 /* Extend unicode object */
6671 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6672 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6673 PyErr_NoMemory();
6674 goto error;
6675 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006676 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006677 goto error;
6678 startout = PyUnicode_AS_UNICODE(*v) + n;
6679 }
6680
6681 /* Decode the byte string character per character */
6682 out = startout;
6683 while (in < endin)
6684 {
6685 /* Decode a character */
6686 insize = 1;
6687 do
6688 {
6689 outsize = MultiByteToWideChar(code_page, flags,
6690 in, insize,
6691 buffer, Py_ARRAY_LENGTH(buffer));
6692 if (outsize > 0)
6693 break;
6694 err = GetLastError();
6695 if (err != ERROR_NO_UNICODE_TRANSLATION
6696 && err != ERROR_INSUFFICIENT_BUFFER)
6697 {
6698 PyErr_SetFromWindowsErr(0);
6699 goto error;
6700 }
6701 insize++;
6702 }
6703 /* 4=maximum length of a UTF-8 sequence */
6704 while (insize <= 4 && (in + insize) <= endin);
6705
6706 if (outsize <= 0) {
6707 Py_ssize_t startinpos, endinpos, outpos;
6708
6709 startinpos = in - startin;
6710 endinpos = startinpos + 1;
6711 outpos = out - PyUnicode_AS_UNICODE(*v);
6712 if (unicode_decode_call_errorhandler(
6713 errors, &errorHandler,
6714 encoding, reason,
6715 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006716 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006717 {
6718 goto error;
6719 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006720 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006721 }
6722 else {
6723 in += insize;
6724 memcpy(out, buffer, outsize * sizeof(wchar_t));
6725 out += outsize;
6726 }
6727 }
6728
6729 /* write a NUL character at the end */
6730 *out = 0;
6731
6732 /* Extend unicode object */
6733 outsize = out - startout;
6734 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006735 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006736 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006737 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006738
6739error:
6740 Py_XDECREF(encoding_obj);
6741 Py_XDECREF(errorHandler);
6742 Py_XDECREF(exc);
6743 return ret;
6744}
6745
Victor Stinner3a50e702011-10-18 21:21:00 +02006746static PyObject *
6747decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006748 const char *s, Py_ssize_t size,
6749 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006750{
Victor Stinner76a31a62011-11-04 00:05:13 +01006751 PyObject *v = NULL;
6752 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006753
Victor Stinner3a50e702011-10-18 21:21:00 +02006754 if (code_page < 0) {
6755 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6756 return NULL;
6757 }
6758
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006759 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006761
Victor Stinner76a31a62011-11-04 00:05:13 +01006762 do
6763 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006764#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006765 if (size > INT_MAX) {
6766 chunk_size = INT_MAX;
6767 final = 0;
6768 done = 0;
6769 }
6770 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006771#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006772 {
6773 chunk_size = (int)size;
6774 final = (consumed == NULL);
6775 done = 1;
6776 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006777
Victor Stinner76a31a62011-11-04 00:05:13 +01006778 /* Skip trailing lead-byte unless 'final' is set */
6779 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6780 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006781
Victor Stinner76a31a62011-11-04 00:05:13 +01006782 if (chunk_size == 0 && done) {
6783 if (v != NULL)
6784 break;
6785 Py_INCREF(unicode_empty);
6786 return unicode_empty;
6787 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006788
Victor Stinner76a31a62011-11-04 00:05:13 +01006789
6790 converted = decode_code_page_strict(code_page, &v,
6791 s, chunk_size);
6792 if (converted == -2)
6793 converted = decode_code_page_errors(code_page, &v,
6794 s, chunk_size,
6795 errors);
6796 assert(converted != 0);
6797
6798 if (converted < 0) {
6799 Py_XDECREF(v);
6800 return NULL;
6801 }
6802
6803 if (consumed)
6804 *consumed += converted;
6805
6806 s += converted;
6807 size -= converted;
6808 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006809
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006810 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006811}
6812
Alexander Belopolsky40018472011-02-26 01:02:56 +00006813PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006814PyUnicode_DecodeCodePageStateful(int code_page,
6815 const char *s,
6816 Py_ssize_t size,
6817 const char *errors,
6818 Py_ssize_t *consumed)
6819{
6820 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6821}
6822
6823PyObject *
6824PyUnicode_DecodeMBCSStateful(const char *s,
6825 Py_ssize_t size,
6826 const char *errors,
6827 Py_ssize_t *consumed)
6828{
6829 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6830}
6831
6832PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006833PyUnicode_DecodeMBCS(const char *s,
6834 Py_ssize_t size,
6835 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006836{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006837 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6838}
6839
Victor Stinner3a50e702011-10-18 21:21:00 +02006840static DWORD
6841encode_code_page_flags(UINT code_page, const char *errors)
6842{
6843 if (code_page == CP_UTF8) {
6844 if (winver.dwMajorVersion >= 6)
6845 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6846 and later */
6847 return WC_ERR_INVALID_CHARS;
6848 else
6849 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6850 return 0;
6851 }
6852 else if (code_page == CP_UTF7) {
6853 /* CP_UTF7 only supports flags=0 */
6854 return 0;
6855 }
6856 else {
6857 if (errors != NULL && strcmp(errors, "replace") == 0)
6858 return 0;
6859 else
6860 return WC_NO_BEST_FIT_CHARS;
6861 }
6862}
6863
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006864/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006865 * Encode a Unicode string to a Windows code page into a byte string in strict
6866 * mode.
6867 *
6868 * Returns consumed characters if succeed, returns -2 on encode error, or raise
6869 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006870 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006871static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006872encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006873 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006874 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006875{
Victor Stinner554f3f02010-06-16 23:33:54 +00006876 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006877 BOOL *pusedDefaultChar = &usedDefaultChar;
6878 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006879 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006880 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006881 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006882 const DWORD flags = encode_code_page_flags(code_page, NULL);
6883 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006884 /* Create a substring so that we can get the UTF-16 representation
6885 of just the slice under consideration. */
6886 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006887
Martin v. Löwis3d325192011-11-04 18:23:06 +01006888 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006889
Victor Stinner3a50e702011-10-18 21:21:00 +02006890 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006891 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006893 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006894
Victor Stinner2fc507f2011-11-04 20:06:39 +01006895 substring = PyUnicode_Substring(unicode, offset, offset+len);
6896 if (substring == NULL)
6897 return -1;
6898 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6899 if (p == NULL) {
6900 Py_DECREF(substring);
6901 return -1;
6902 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006903
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006904 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006905 outsize = WideCharToMultiByte(code_page, flags,
6906 p, size,
6907 NULL, 0,
6908 NULL, pusedDefaultChar);
6909 if (outsize <= 0)
6910 goto error;
6911 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006912 if (pusedDefaultChar && *pusedDefaultChar) {
6913 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006914 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006915 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006916
Victor Stinner3a50e702011-10-18 21:21:00 +02006917 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006919 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006920 if (*outbytes == NULL) {
6921 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006923 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006924 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006925 }
6926 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006928 const Py_ssize_t n = PyBytes_Size(*outbytes);
6929 if (outsize > PY_SSIZE_T_MAX - n) {
6930 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006931 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006933 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006934 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6935 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006936 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006937 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006938 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006939 }
6940
6941 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006942 outsize = WideCharToMultiByte(code_page, flags,
6943 p, size,
6944 out, outsize,
6945 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006946 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006947 if (outsize <= 0)
6948 goto error;
6949 if (pusedDefaultChar && *pusedDefaultChar)
6950 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006951 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006952
Victor Stinner3a50e702011-10-18 21:21:00 +02006953error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006954 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006955 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6956 return -2;
6957 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006958 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006959}
6960
/*
 * Encode a Unicode string to a Windows code page into a byte string using a
 * error handler.
 *
 * Characters [unicode_offset, unicode_offset + insize) of `unicode` are
 * encoded one at a time; every character the converter cannot encode is
 * passed to the error handler named by `errors`.  The bytes are stored in a
 * new bytes object when *outbytes is NULL, or appended to *outbytes
 * otherwise.  With errors == NULL or "strict" nothing is encoded: a
 * UnicodeEncodeError is raised instead.
 *
 * Returns 0 on success, or sets an exception (a WindowsError for converter
 * failures) and returns -1 on error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* No default-character flag is passed for CP_UTF8 and CP_UTF7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve Py_ARRAY_LENGTH(buffer) output bytes per input character,
       refusing sizes whose product would overflow Py_ssize_t. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Build the UTF-16 form of the character: one unit below 0x10000,
           a surrogate pair otherwise. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Accept the bytes only if no default character was used;
               otherwise fall through to the error handler. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character could not be encoded: ask the error handler for a
           replacement and for the position at which to resume. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* Bytes replacement: copied verbatim. */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Adjust the output buffer by (outsize - 1) bytes; keep the
                   write position as an offset because the resize may move
                   the buffer. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* Otherwise the replacement is handled as a unicode string,
               which must consist of code points <= 127 only; each one is
               written as a single byte. */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                /* Same buffer adjustment as for a bytes replacement. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    /* NOTE(review): `pos` already holds newpos here, so the
                       reported range is the resume position rather than the
                       offending character - confirm whether intended. */
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Trim the bytes object to the number of bytes actually written,
       measured from the start of the object's buffer. */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7148
Victor Stinner3a50e702011-10-18 21:21:00 +02007149static PyObject *
7150encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007151 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007152 const char *errors)
7153{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007154 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007156 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007157 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007158
Benjamin Petersonbac79492012-01-14 13:34:47 -05007159 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007160 return NULL;
7161 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007162
Victor Stinner3a50e702011-10-18 21:21:00 +02007163 if (code_page < 0) {
7164 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7165 return NULL;
7166 }
7167
Martin v. Löwis3d325192011-11-04 18:23:06 +01007168 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007169 return PyBytes_FromStringAndSize(NULL, 0);
7170
Victor Stinner7581cef2011-11-03 22:32:33 +01007171 offset = 0;
7172 do
7173 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007174#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007175 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007176 chunks. */
7177 if (len > INT_MAX/2) {
7178 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007179 done = 0;
7180 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007181 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007182#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007183 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007184 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007185 done = 1;
7186 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007187
Victor Stinner76a31a62011-11-04 00:05:13 +01007188 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007189 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007190 errors);
7191 if (ret == -2)
7192 ret = encode_code_page_errors(code_page, &outbytes,
7193 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007194 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007195 if (ret < 0) {
7196 Py_XDECREF(outbytes);
7197 return NULL;
7198 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007199
Victor Stinner7581cef2011-11-03 22:32:33 +01007200 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007201 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007202 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007203
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 return outbytes;
7205}
7206
7207PyObject *
7208PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7209 Py_ssize_t size,
7210 const char *errors)
7211{
Victor Stinner7581cef2011-11-03 22:32:33 +01007212 PyObject *unicode, *res;
7213 unicode = PyUnicode_FromUnicode(p, size);
7214 if (unicode == NULL)
7215 return NULL;
7216 res = encode_code_page(CP_ACP, unicode, errors);
7217 Py_DECREF(unicode);
7218 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007219}
7220
7221PyObject *
7222PyUnicode_EncodeCodePage(int code_page,
7223 PyObject *unicode,
7224 const char *errors)
7225{
Victor Stinner7581cef2011-11-03 22:32:33 +01007226 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007227}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007228
Alexander Belopolsky40018472011-02-26 01:02:56 +00007229PyObject *
7230PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007231{
7232 if (!PyUnicode_Check(unicode)) {
7233 PyErr_BadArgument();
7234 return NULL;
7235 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007236 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007237}
7238
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007239#undef NEED_RETRY
7240
Victor Stinner99b95382011-07-04 14:23:54 +02007241#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007242
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243/* --- Character Mapping Codec -------------------------------------------- */
7244
Alexander Belopolsky40018472011-02-26 01:02:56 +00007245PyObject *
7246PyUnicode_DecodeCharmap(const char *s,
7247 Py_ssize_t size,
7248 PyObject *mapping,
7249 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007251 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007252 Py_ssize_t startinpos;
7253 Py_ssize_t endinpos;
7254 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007255 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007256 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007257 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007258 PyObject *errorHandler = NULL;
7259 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007260
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261 /* Default to Latin-1 */
7262 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007263 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007265 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007269 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007270 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007271 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007272 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007273 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007274 enum PyUnicode_Kind mapkind;
7275 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007276 Py_UCS4 x;
7277
Benjamin Petersonbac79492012-01-14 13:34:47 -05007278 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007279 return NULL;
7280
7281 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007282 mapdata = PyUnicode_DATA(mapping);
7283 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007284 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007285 unsigned char ch;
7286 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7287 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7288 if (outkind == PyUnicode_1BYTE_KIND) {
7289 void *outdata = PyUnicode_DATA(v);
7290 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7291 while (s < e) {
7292 unsigned char ch = *s;
7293 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7294 if (x > maxchar)
7295 goto Error;
7296 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7297 ++s;
7298 }
7299 break;
7300 }
7301 else if (outkind == PyUnicode_2BYTE_KIND) {
7302 void *outdata = PyUnicode_DATA(v);
7303 while (s < e) {
7304 unsigned char ch = *s;
7305 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7306 if (x == 0xFFFE)
7307 goto Error;
7308 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7309 ++s;
7310 }
7311 break;
7312 }
7313 }
7314 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007317 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007318 else
7319 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007320Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007321 if (x == 0xfffe)
7322 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007324 startinpos = s-starts;
7325 endinpos = startinpos+1;
7326 if (unicode_decode_call_errorhandler(
7327 errors, &errorHandler,
7328 "charmap", "character maps to <undefined>",
7329 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007330 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 goto onError;
7332 }
7333 continue;
7334 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007335
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007336 if (unicode_putchar(&v, &outpos, x) < 0)
7337 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007339 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007340 }
7341 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007342 while (s < e) {
7343 unsigned char ch = *s;
7344 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007345
Benjamin Peterson29060642009-01-31 22:14:21 +00007346 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7347 w = PyLong_FromLong((long)ch);
7348 if (w == NULL)
7349 goto onError;
7350 x = PyObject_GetItem(mapping, w);
7351 Py_DECREF(w);
7352 if (x == NULL) {
7353 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7354 /* No mapping found means: mapping is undefined. */
7355 PyErr_Clear();
7356 x = Py_None;
7357 Py_INCREF(x);
7358 } else
7359 goto onError;
7360 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007361
Benjamin Peterson29060642009-01-31 22:14:21 +00007362 /* Apply mapping */
7363 if (PyLong_Check(x)) {
7364 long value = PyLong_AS_LONG(x);
Antoine Pitroua1f76552012-09-23 20:00:04 +02007365 if (value < 0 || value > MAX_UNICODE) {
7366 PyErr_Format(PyExc_TypeError,
7367 "character mapping must be in range(0x%lx)",
7368 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 Py_DECREF(x);
7370 goto onError;
7371 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007372 if (unicode_putchar(&v, &outpos, value) < 0)
7373 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 }
7375 else if (x == Py_None) {
7376 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 startinpos = s-starts;
7378 endinpos = startinpos+1;
7379 if (unicode_decode_call_errorhandler(
7380 errors, &errorHandler,
7381 "charmap", "character maps to <undefined>",
7382 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007383 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 Py_DECREF(x);
7385 goto onError;
7386 }
7387 Py_DECREF(x);
7388 continue;
7389 }
7390 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007391 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007392
Benjamin Petersonbac79492012-01-14 13:34:47 -05007393 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007394 goto onError;
7395 targetsize = PyUnicode_GET_LENGTH(x);
7396
7397 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007399 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007400 PyUnicode_READ_CHAR(x, 0)) < 0)
7401 goto onError;
7402 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 else if (targetsize > 1) {
7404 /* 1-n mapping */
7405 if (targetsize > extrachars) {
7406 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 Py_ssize_t needed = (targetsize - extrachars) + \
7408 (targetsize << 2);
7409 extrachars += needed;
7410 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007411 if (unicode_resize(&v,
7412 PyUnicode_GET_LENGTH(v) + needed) < 0)
7413 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 Py_DECREF(x);
7415 goto onError;
7416 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007418 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007419 goto onError;
7420 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7421 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 extrachars -= targetsize;
7423 }
7424 /* 1-0 mapping: skip the character */
7425 }
7426 else {
7427 /* wrong return value */
7428 PyErr_SetString(PyExc_TypeError,
7429 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007430 Py_DECREF(x);
7431 goto onError;
7432 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 Py_DECREF(x);
7434 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007435 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007437 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007438 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007439 Py_XDECREF(errorHandler);
7440 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007441 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007442
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007444 Py_XDECREF(errorHandler);
7445 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 Py_XDECREF(v);
7447 return NULL;
7448}
7449
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007450/* Charmap encoding: the lookup table */
7451
Alexander Belopolsky40018472011-02-26 01:02:56 +00007452struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 PyObject_HEAD
7454 unsigned char level1[32];
7455 int count2, count3;
7456 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007457};
7458
7459static PyObject*
7460encoding_map_size(PyObject *obj, PyObject* args)
7461{
7462 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007463 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007465}
7466
7467static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007468 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 PyDoc_STR("Return the size (in bytes) of this object") },
7470 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007471};
7472
/* tp_dealloc slot of EncodingMapType: the map holds no references to
   other objects, so releasing its memory block is all that is needed. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
7478
/* Type object for the trie built by PyUnicode_BuildEncodingMap().
   Only tp_dealloc and tp_methods are filled in; tp_new is left empty and
   instances are allocated directly by PyUnicode_BuildEncodingMap(). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7522
7523PyObject*
7524PyUnicode_BuildEncodingMap(PyObject* string)
7525{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007526 PyObject *result;
7527 struct encoding_map *mresult;
7528 int i;
7529 int need_dict = 0;
7530 unsigned char level1[32];
7531 unsigned char level2[512];
7532 unsigned char *mlevel1, *mlevel2, *mlevel3;
7533 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007534 int kind;
7535 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007536 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007537 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007538
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007539 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007540 PyErr_BadArgument();
7541 return NULL;
7542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007543 kind = PyUnicode_KIND(string);
7544 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007545 length = PyUnicode_GET_LENGTH(string);
7546 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007547 memset(level1, 0xFF, sizeof level1);
7548 memset(level2, 0xFF, sizeof level2);
7549
7550 /* If there isn't a one-to-one mapping of NULL to \0,
7551 or if there are non-BMP characters, we need to use
7552 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007553 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007554 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007555 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007556 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007557 ch = PyUnicode_READ(kind, data, i);
7558 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007559 need_dict = 1;
7560 break;
7561 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007562 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007563 /* unmapped character */
7564 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007565 l1 = ch >> 11;
7566 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007567 if (level1[l1] == 0xFF)
7568 level1[l1] = count2++;
7569 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007570 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007571 }
7572
7573 if (count2 >= 0xFF || count3 >= 0xFF)
7574 need_dict = 1;
7575
7576 if (need_dict) {
7577 PyObject *result = PyDict_New();
7578 PyObject *key, *value;
7579 if (!result)
7580 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007581 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007582 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007583 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007584 if (!key || !value)
7585 goto failed1;
7586 if (PyDict_SetItem(result, key, value) == -1)
7587 goto failed1;
7588 Py_DECREF(key);
7589 Py_DECREF(value);
7590 }
7591 return result;
7592 failed1:
7593 Py_XDECREF(key);
7594 Py_XDECREF(value);
7595 Py_DECREF(result);
7596 return NULL;
7597 }
7598
7599 /* Create a three-level trie */
7600 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7601 16*count2 + 128*count3 - 1);
7602 if (!result)
7603 return PyErr_NoMemory();
7604 PyObject_Init(result, &EncodingMapType);
7605 mresult = (struct encoding_map*)result;
7606 mresult->count2 = count2;
7607 mresult->count3 = count3;
7608 mlevel1 = mresult->level1;
7609 mlevel2 = mresult->level23;
7610 mlevel3 = mresult->level23 + 16*count2;
7611 memcpy(mlevel1, level1, 32);
7612 memset(mlevel2, 0xFF, 16*count2);
7613 memset(mlevel3, 0, 128*count3);
7614 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007615 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007616 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007617 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7618 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007619 /* unmapped character */
7620 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007621 o1 = ch>>11;
7622 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007623 i2 = 16*mlevel1[o1] + o2;
7624 if (mlevel2[i2] == 0xFF)
7625 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007626 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007627 i3 = 128*mlevel2[i2] + o3;
7628 mlevel3[i3] = i;
7629 }
7630 return result;
7631}
7632
7633static int
Victor Stinner22168992011-11-20 17:09:18 +01007634encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007635{
7636 struct encoding_map *map = (struct encoding_map*)mapping;
7637 int l1 = c>>11;
7638 int l2 = (c>>7) & 0xF;
7639 int l3 = c & 0x7F;
7640 int i;
7641
Victor Stinner22168992011-11-20 17:09:18 +01007642 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007643 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007644 if (c == 0)
7645 return 0;
7646 /* level 1*/
7647 i = map->level1[l1];
7648 if (i == 0xFF) {
7649 return -1;
7650 }
7651 /* level 2*/
7652 i = map->level23[16*i+l2];
7653 if (i == 0xFF) {
7654 return -1;
7655 }
7656 /* level 3 */
7657 i = map->level23[16*map->count2 + 128*i + l3];
7658 if (i == 0) {
7659 return -1;
7660 }
7661 return i;
7662}
7663
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007664/* Lookup the character ch in the mapping. If the character
7665 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007666 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007667static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007668charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669{
Christian Heimes217cfd12007-12-02 14:31:20 +00007670 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007671 PyObject *x;
7672
7673 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007675 x = PyObject_GetItem(mapping, w);
7676 Py_DECREF(w);
7677 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7679 /* No mapping found means: mapping is undefined. */
7680 PyErr_Clear();
7681 x = Py_None;
7682 Py_INCREF(x);
7683 return x;
7684 } else
7685 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007687 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007689 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007690 long value = PyLong_AS_LONG(x);
7691 if (value < 0 || value > 255) {
7692 PyErr_SetString(PyExc_TypeError,
7693 "character mapping must be in range(256)");
7694 Py_DECREF(x);
7695 return NULL;
7696 }
7697 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007699 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007700 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 /* wrong return value */
7703 PyErr_Format(PyExc_TypeError,
7704 "character mapping must return integer, bytes or None, not %.400s",
7705 x->ob_type->tp_name);
7706 Py_DECREF(x);
7707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708 }
7709}
7710
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007711static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007712charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007713{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007714 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7715 /* exponentially overallocate to minimize reallocations */
7716 if (requiredsize < 2*outsize)
7717 requiredsize = 2*outsize;
7718 if (_PyBytes_Resize(outobj, requiredsize))
7719 return -1;
7720 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007721}
7722
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007726/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007727 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007728 space is available. Return a new reference to the object that
7729 was put in the output buffer, or Py_None, if the mapping was undefined
7730 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007731 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007732static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007733charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007734 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007735{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007736 PyObject *rep;
7737 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007738 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007739
Christian Heimes90aa7642007-12-19 02:45:37 +00007740 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007741 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007743 if (res == -1)
7744 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 if (outsize<requiredsize)
7746 if (charmapencode_resize(outobj, outpos, requiredsize))
7747 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007748 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 outstart[(*outpos)++] = (char)res;
7750 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007751 }
7752
7753 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007754 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007756 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 Py_DECREF(rep);
7758 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007759 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 if (PyLong_Check(rep)) {
7761 Py_ssize_t requiredsize = *outpos+1;
7762 if (outsize<requiredsize)
7763 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7764 Py_DECREF(rep);
7765 return enc_EXCEPTION;
7766 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007767 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007768 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007769 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 else {
7771 const char *repchars = PyBytes_AS_STRING(rep);
7772 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7773 Py_ssize_t requiredsize = *outpos+repsize;
7774 if (outsize<requiredsize)
7775 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7776 Py_DECREF(rep);
7777 return enc_EXCEPTION;
7778 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007779 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 memcpy(outstart + *outpos, repchars, repsize);
7781 *outpos += repsize;
7782 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007783 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007784 Py_DECREF(rep);
7785 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007786}
7787
/* Handle an encoding error in PyUnicode_EncodeCharmap.

   On entry *inpos is the index in `unicode` of a character that could not
   be encoded through `mapping`.  The run of consecutive unencodable
   characters starting there is collected and handled according to
   `errors`; replacement output is appended to *res at *respos.

   *known_errorHandler caches the decoded handler name between calls:
   -1 = not determined yet, 1 = strict, 2 = replace, 3 = ignore,
   4 = xmlcharrefreplace, 0 = any other (callback looked up by name).

   On success *inpos is set to the position where encoding should resume.
   Return 0 on success, -1 on error (with an exception set). */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    int *known_errorHandler, PyObject **errorHandler, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    char *encoding = "charmap";
    char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters: extend collendpos until the first
       character that does have a mapping (or the end of the string) */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*known_errorHandler==-1) {
        if ((errors==NULL) || (!strcmp(errors, "strict")))
            *known_errorHandler = 1;
        else if (!strcmp(errors, "replace"))
            *known_errorHandler = 2;
        else if (!strcmp(errors, "ignore"))
            *known_errorHandler = 3;
        else if (!strcmp(errors, "xmlcharrefreplace"))
            *known_errorHandler = 4;
        else
            *known_errorHandler = 0;
    }
    switch (*known_errorHandler) {
    case 1: /* strict */
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;
    case 2: /* replace */
        /* emit one '?' per unencodable character; '?' itself must be
           encodable through the mapping */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case 3: /* ignore */
        *inpos = collendpos;
        break;
    case 4: /* xmlcharrefreplace */
        /* generate replacement: each character becomes "&#<decimal>;",
           whose ASCII characters are themselves encoded via the mapping */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;
    default:
        /* any other handler: call the registered callback, which returns
           the replacement and the position to resume at (newpos) */
        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: a str result is encoded character by
           character through the mapping */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
7941
Alexander Belopolsky40018472011-02-26 01:02:56 +00007942PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007943_PyUnicode_EncodeCharmap(PyObject *unicode,
7944 PyObject *mapping,
7945 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007947 /* output object */
7948 PyObject *res = NULL;
7949 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007950 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007951 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007952 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007953 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007954 PyObject *errorHandler = NULL;
7955 PyObject *exc = NULL;
7956 /* the following variable is used for caching string comparisons
7957 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7958 * 3=ignore, 4=xmlcharrefreplace */
7959 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960
Benjamin Petersonbac79492012-01-14 13:34:47 -05007961 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007962 return NULL;
7963 size = PyUnicode_GET_LENGTH(unicode);
7964
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965 /* Default to Latin-1 */
7966 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007967 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007969 /* allocate enough for a simple encoding without
7970 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007971 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007972 if (res == NULL)
7973 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007974 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007977 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007978 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007980 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 if (x==enc_EXCEPTION) /* error */
7982 goto onError;
7983 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007984 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 &exc,
7986 &known_errorHandler, &errorHandler, errors,
7987 &res, &respos)) {
7988 goto onError;
7989 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007990 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 else
7992 /* done with this character => adjust input position */
7993 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007996 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007997 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007998 if (_PyBytes_Resize(&res, respos) < 0)
7999 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008000
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008001 Py_XDECREF(exc);
8002 Py_XDECREF(errorHandler);
8003 return res;
8004
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008006 Py_XDECREF(res);
8007 Py_XDECREF(exc);
8008 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 return NULL;
8010}
8011
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008012/* Deprecated */
8013PyObject *
8014PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8015 Py_ssize_t size,
8016 PyObject *mapping,
8017 const char *errors)
8018{
8019 PyObject *result;
8020 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8021 if (unicode == NULL)
8022 return NULL;
8023 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8024 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008025 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008026}
8027
Alexander Belopolsky40018472011-02-26 01:02:56 +00008028PyObject *
8029PyUnicode_AsCharmapString(PyObject *unicode,
8030 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031{
8032 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 PyErr_BadArgument();
8034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008036 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037}
8038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008039/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008040static void
8041make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008042 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008043 Py_ssize_t startpos, Py_ssize_t endpos,
8044 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008046 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008047 *exceptionObject = _PyUnicodeTranslateError_Create(
8048 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 }
8050 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8052 goto onError;
8053 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8054 goto onError;
8055 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8056 goto onError;
8057 return;
8058 onError:
8059 Py_DECREF(*exceptionObject);
8060 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061 }
8062}
8063
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008064/* error handling callback helper:
8065 build arguments, call the callback and check the arguments,
8066 put the result into newpos and return the replacement string, which
8067 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008068static PyObject *
8069unicode_translate_call_errorhandler(const char *errors,
8070 PyObject **errorHandler,
8071 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008072 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008073 Py_ssize_t startpos, Py_ssize_t endpos,
8074 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008076 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008077
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008078 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008079 PyObject *restuple;
8080 PyObject *resunicode;
8081
8082 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008086 }
8087
8088 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008089 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008090 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092
8093 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008098 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 Py_DECREF(restuple);
8100 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008101 }
8102 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 &resunicode, &i_newpos)) {
8104 Py_DECREF(restuple);
8105 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008106 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008107 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008109 else
8110 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008111 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8113 Py_DECREF(restuple);
8114 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008115 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008116 Py_INCREF(resunicode);
8117 Py_DECREF(restuple);
8118 return resunicode;
8119}
8120
8121/* Lookup the character ch in the mapping and put the result in result,
8122 which must be decrefed by the caller.
8123 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008124static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008125charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008126{
Christian Heimes217cfd12007-12-02 14:31:20 +00008127 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008128 PyObject *x;
8129
8130 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008132 x = PyObject_GetItem(mapping, w);
8133 Py_DECREF(w);
8134 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8136 /* No mapping found means: use 1:1 mapping. */
8137 PyErr_Clear();
8138 *result = NULL;
8139 return 0;
8140 } else
8141 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008142 }
8143 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 *result = x;
8145 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008147 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 long value = PyLong_AS_LONG(x);
8149 long max = PyUnicode_GetMax();
8150 if (value < 0 || value > max) {
8151 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008152 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 Py_DECREF(x);
8154 return -1;
8155 }
8156 *result = x;
8157 return 0;
8158 }
8159 else if (PyUnicode_Check(x)) {
8160 *result = x;
8161 return 0;
8162 }
8163 else {
8164 /* wrong return value */
8165 PyErr_SetString(PyExc_TypeError,
8166 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008167 Py_DECREF(x);
8168 return -1;
8169 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008170}
8171/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 if not reallocate and adjust various state variables.
8173 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008174static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008175charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008177{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008178 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008179 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008180 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 /* exponentially overallocate to minimize reallocations */
8182 if (requiredsize < 2 * oldsize)
8183 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008184 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8185 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008187 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008188 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008189 }
8190 return 0;
8191}
8192/* lookup the character, put the result in the output string and adjust
8193 various state variables. Return a new reference to the object that
8194 was put in the output buffer in *result, or Py_None, if the mapping was
8195 undefined (in which case no character was written).
8196 The called must decref result.
8197 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008198static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008199charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8200 PyObject *mapping, Py_UCS4 **output,
8201 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008202 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008204 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8205 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008207 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008209 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008210 }
8211 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008213 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008215 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008216 }
8217 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008218 Py_ssize_t repsize;
8219 if (PyUnicode_READY(*res) == -1)
8220 return -1;
8221 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 if (repsize==1) {
8223 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008224 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 }
8226 else if (repsize!=0) {
8227 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008228 Py_ssize_t requiredsize = *opos +
8229 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008231 Py_ssize_t i;
8232 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008234 for(i = 0; i < repsize; i++)
8235 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008237 }
8238 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240 return 0;
8241}
8242
Alexander Belopolsky40018472011-02-26 01:02:56 +00008243PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008244_PyUnicode_TranslateCharmap(PyObject *input,
8245 PyObject *mapping,
8246 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 /* input object */
8249 char *idata;
8250 Py_ssize_t size, i;
8251 int kind;
8252 /* output buffer */
8253 Py_UCS4 *output = NULL;
8254 Py_ssize_t osize;
8255 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008256 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008257 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 char *reason = "character maps to <undefined>";
8259 PyObject *errorHandler = NULL;
8260 PyObject *exc = NULL;
8261 /* the following variable is used for caching string comparisons
8262 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8263 * 3=ignore, 4=xmlcharrefreplace */
8264 int known_errorHandler = -1;
8265
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 PyErr_BadArgument();
8268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271 if (PyUnicode_READY(input) == -1)
8272 return NULL;
8273 idata = (char*)PyUnicode_DATA(input);
8274 kind = PyUnicode_KIND(input);
8275 size = PyUnicode_GET_LENGTH(input);
8276 i = 0;
8277
8278 if (size == 0) {
8279 Py_INCREF(input);
8280 return input;
8281 }
8282
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008283 /* allocate enough for a simple 1:1 translation without
8284 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008285 osize = size;
8286 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8287 opos = 0;
8288 if (output == NULL) {
8289 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008293 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 /* try to encode it */
8295 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008296 if (charmaptranslate_output(input, i, mapping,
8297 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 Py_XDECREF(x);
8299 goto onError;
8300 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008301 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008303 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 else { /* untranslatable character */
8305 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8306 Py_ssize_t repsize;
8307 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008308 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008310 Py_ssize_t collstart = i;
8311 Py_ssize_t collend = i+1;
8312 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008315 while (collend < size) {
8316 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 goto onError;
8318 Py_XDECREF(x);
8319 if (x!=Py_None)
8320 break;
8321 ++collend;
8322 }
8323 /* cache callback name lookup
8324 * (if not done yet, i.e. it's the first error) */
8325 if (known_errorHandler==-1) {
8326 if ((errors==NULL) || (!strcmp(errors, "strict")))
8327 known_errorHandler = 1;
8328 else if (!strcmp(errors, "replace"))
8329 known_errorHandler = 2;
8330 else if (!strcmp(errors, "ignore"))
8331 known_errorHandler = 3;
8332 else if (!strcmp(errors, "xmlcharrefreplace"))
8333 known_errorHandler = 4;
8334 else
8335 known_errorHandler = 0;
8336 }
8337 switch (known_errorHandler) {
8338 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008339 make_translate_exception(&exc,
8340 input, collstart, collend, reason);
8341 if (exc != NULL)
8342 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008343 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 case 2: /* replace */
8345 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008346 for (coll = collstart; coll<collend; coll++)
8347 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 /* fall through */
8349 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008350 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 break;
8352 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 /* generate replacement (temporarily (mis)uses i) */
8354 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 char buffer[2+29+1+1];
8356 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008357 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8358 if (charmaptranslate_makespace(&output, &osize,
8359 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 goto onError;
8361 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 break;
8366 default:
8367 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 reason, input, &exc,
8369 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008370 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008372 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008373 Py_DECREF(repunicode);
8374 goto onError;
8375 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 repsize = PyUnicode_GET_LENGTH(repunicode);
8378 if (charmaptranslate_makespace(&output, &osize,
8379 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 Py_DECREF(repunicode);
8381 goto onError;
8382 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 for (uni2 = 0; repsize-->0; ++uni2)
8384 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8385 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008387 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008388 }
8389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008390 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8391 if (!res)
8392 goto onError;
8393 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 Py_XDECREF(exc);
8395 Py_XDECREF(errorHandler);
8396 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008399 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400 Py_XDECREF(exc);
8401 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402 return NULL;
8403}
8404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008405/* Deprecated. Use PyUnicode_Translate instead. */
8406PyObject *
8407PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8408 Py_ssize_t size,
8409 PyObject *mapping,
8410 const char *errors)
8411{
Christian Heimes5f520f42012-09-11 14:03:25 +02008412 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008413 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8414 if (!unicode)
8415 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008416 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8417 Py_DECREF(unicode);
8418 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008419}
8420
Alexander Belopolsky40018472011-02-26 01:02:56 +00008421PyObject *
8422PyUnicode_Translate(PyObject *str,
8423 PyObject *mapping,
8424 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425{
8426 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008427
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428 str = PyUnicode_FromObject(str);
8429 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008430 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008431 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 Py_DECREF(str);
8433 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434}
Tim Petersced69f82003-09-16 20:30:58 +00008435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008436static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008437fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438{
8439 /* No need to call PyUnicode_READY(self) because this function is only
8440 called as a callback from fixup() which does it already. */
8441 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8442 const int kind = PyUnicode_KIND(self);
8443 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008444 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008445 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008446 Py_ssize_t i;
8447
8448 for (i = 0; i < len; ++i) {
8449 ch = PyUnicode_READ(kind, data, i);
8450 fixed = 0;
8451 if (ch > 127) {
8452 if (Py_UNICODE_ISSPACE(ch))
8453 fixed = ' ';
8454 else {
8455 const int decimal = Py_UNICODE_TODECIMAL(ch);
8456 if (decimal >= 0)
8457 fixed = '0' + decimal;
8458 }
8459 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008460 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008461 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462 PyUnicode_WRITE(kind, data, i, fixed);
8463 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008464 else
8465 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 }
8468
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008469 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470}
8471
8472PyObject *
8473_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8474{
8475 if (!PyUnicode_Check(unicode)) {
8476 PyErr_BadInternalCall();
8477 return NULL;
8478 }
8479 if (PyUnicode_READY(unicode) == -1)
8480 return NULL;
8481 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8482 /* If the string is already ASCII, just return the same string */
8483 Py_INCREF(unicode);
8484 return unicode;
8485 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008486 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487}
8488
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008489PyObject *
8490PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8491 Py_ssize_t length)
8492{
Victor Stinnerf0124502011-11-21 23:12:56 +01008493 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008494 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008495 Py_UCS4 maxchar;
8496 enum PyUnicode_Kind kind;
8497 void *data;
8498
Victor Stinner99d7ad02012-02-22 13:37:39 +01008499 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008500 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008501 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008502 if (ch > 127) {
8503 int decimal = Py_UNICODE_TODECIMAL(ch);
8504 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008505 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008506 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008507 }
8508 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008509
8510 /* Copy to a new string */
8511 decimal = PyUnicode_New(length, maxchar);
8512 if (decimal == NULL)
8513 return decimal;
8514 kind = PyUnicode_KIND(decimal);
8515 data = PyUnicode_DATA(decimal);
8516 /* Iterate over code points */
8517 for (i = 0; i < length; i++) {
8518 Py_UNICODE ch = s[i];
8519 if (ch > 127) {
8520 int decimal = Py_UNICODE_TODECIMAL(ch);
8521 if (decimal >= 0)
8522 ch = '0' + decimal;
8523 }
8524 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008526 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008527}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008528/* --- Decimal Encoder ---------------------------------------------------- */
8529
Alexander Belopolsky40018472011-02-26 01:02:56 +00008530int
8531PyUnicode_EncodeDecimal(Py_UNICODE *s,
8532 Py_ssize_t length,
8533 char *output,
8534 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008535{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008536 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008537 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008538 enum PyUnicode_Kind kind;
8539 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008540
8541 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 PyErr_BadArgument();
8543 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008544 }
8545
Victor Stinner42bf7752011-11-21 22:52:58 +01008546 unicode = PyUnicode_FromUnicode(s, length);
8547 if (unicode == NULL)
8548 return -1;
8549
Benjamin Petersonbac79492012-01-14 13:34:47 -05008550 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008551 Py_DECREF(unicode);
8552 return -1;
8553 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008554 kind = PyUnicode_KIND(unicode);
8555 data = PyUnicode_DATA(unicode);
8556
Victor Stinnerb84d7232011-11-22 01:50:07 +01008557 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008558 PyObject *exc;
8559 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008561 Py_ssize_t startpos;
8562
8563 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008564
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008566 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008567 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008569 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 decimal = Py_UNICODE_TODECIMAL(ch);
8571 if (decimal >= 0) {
8572 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008573 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 continue;
8575 }
8576 if (0 < ch && ch < 256) {
8577 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008578 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 continue;
8580 }
Victor Stinner6345be92011-11-25 20:09:01 +01008581
Victor Stinner42bf7752011-11-21 22:52:58 +01008582 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008583 exc = NULL;
8584 raise_encode_exception(&exc, "decimal", unicode,
8585 startpos, startpos+1,
8586 "invalid decimal Unicode string");
8587 Py_XDECREF(exc);
8588 Py_DECREF(unicode);
8589 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008590 }
8591 /* 0-terminate the output string */
8592 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008593 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008594 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008595}
8596
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597/* --- Helpers ------------------------------------------------------------ */
8598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008600any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 Py_ssize_t start,
8602 Py_ssize_t end)
8603{
8604 int kind1, kind2, kind;
8605 void *buf1, *buf2;
8606 Py_ssize_t len1, len2, result;
8607
8608 kind1 = PyUnicode_KIND(s1);
8609 kind2 = PyUnicode_KIND(s2);
8610 kind = kind1 > kind2 ? kind1 : kind2;
8611 buf1 = PyUnicode_DATA(s1);
8612 buf2 = PyUnicode_DATA(s2);
8613 if (kind1 != kind)
8614 buf1 = _PyUnicode_AsKind(s1, kind);
8615 if (!buf1)
8616 return -2;
8617 if (kind2 != kind)
8618 buf2 = _PyUnicode_AsKind(s2, kind);
8619 if (!buf2) {
8620 if (kind1 != kind) PyMem_Free(buf1);
8621 return -2;
8622 }
8623 len1 = PyUnicode_GET_LENGTH(s1);
8624 len2 = PyUnicode_GET_LENGTH(s2);
8625
Victor Stinner794d5672011-10-10 03:21:36 +02008626 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008627 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008628 case PyUnicode_1BYTE_KIND:
8629 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8630 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8631 else
8632 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8633 break;
8634 case PyUnicode_2BYTE_KIND:
8635 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8636 break;
8637 case PyUnicode_4BYTE_KIND:
8638 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8639 break;
8640 default:
8641 assert(0); result = -2;
8642 }
8643 }
8644 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008645 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008646 case PyUnicode_1BYTE_KIND:
8647 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8648 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8649 else
8650 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8651 break;
8652 case PyUnicode_2BYTE_KIND:
8653 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8654 break;
8655 case PyUnicode_4BYTE_KIND:
8656 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8657 break;
8658 default:
8659 assert(0); result = -2;
8660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 }
8662
8663 if (kind1 != kind)
8664 PyMem_Free(buf1);
8665 if (kind2 != kind)
8666 PyMem_Free(buf2);
8667
8668 return result;
8669}
8670
8671Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008672_PyUnicode_InsertThousandsGrouping(
8673 PyObject *unicode, Py_ssize_t index,
8674 Py_ssize_t n_buffer,
8675 void *digits, Py_ssize_t n_digits,
8676 Py_ssize_t min_width,
8677 const char *grouping, PyObject *thousands_sep,
8678 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679{
Victor Stinner41a863c2012-02-24 00:37:51 +01008680 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008681 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008682 Py_ssize_t thousands_sep_len;
8683 Py_ssize_t len;
8684
8685 if (unicode != NULL) {
8686 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008687 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008688 }
8689 else {
8690 kind = PyUnicode_1BYTE_KIND;
8691 data = NULL;
8692 }
8693 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8694 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8695 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8696 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008697 if (thousands_sep_kind < kind) {
8698 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8699 if (!thousands_sep_data)
8700 return -1;
8701 }
8702 else {
8703 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8704 if (!data)
8705 return -1;
8706 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008707 }
8708
Benjamin Petersonead6b532011-12-20 17:23:42 -06008709 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008711 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008712 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008713 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008714 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008715 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008716 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008717 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008718 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008719 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008720 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008721 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008722 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008723 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008724 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008725 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008726 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008727 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008729 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008730 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008731 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008732 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008733 break;
8734 default:
8735 assert(0);
8736 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008738 if (unicode != NULL && thousands_sep_kind != kind) {
8739 if (thousands_sep_kind < kind)
8740 PyMem_Free(thousands_sep_data);
8741 else
8742 PyMem_Free(data);
8743 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008744 if (unicode == NULL) {
8745 *maxchar = 127;
8746 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008747 *maxchar = MAX_MAXCHAR(*maxchar,
8748 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008749 }
8750 }
8751 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008752}
8753
8754
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the sequence
   length and may be evaluated more than once.

     - end > len         -> end = len
     - end < 0           -> end += len, then clamped to 0 if still negative
     - start < 0         -> start += len, then clamped to 0 if still negative

   A start value larger than len is left untouched.

   The body is wrapped in do { ... } while (0) so that the macro behaves
   as a single statement (safe under an unbraced if/else); invoke it with
   a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008769
Alexander Belopolsky40018472011-02-26 01:02:56 +00008770Py_ssize_t
8771PyUnicode_Count(PyObject *str,
8772 PyObject *substr,
8773 Py_ssize_t start,
8774 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008776 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008777 PyObject* str_obj;
8778 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008779 int kind1, kind2, kind;
8780 void *buf1 = NULL, *buf2 = NULL;
8781 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008782
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008783 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008784 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008786 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008787 if (!sub_obj) {
8788 Py_DECREF(str_obj);
8789 return -1;
8790 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008791 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008792 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 Py_DECREF(str_obj);
8794 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795 }
Tim Petersced69f82003-09-16 20:30:58 +00008796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 kind1 = PyUnicode_KIND(str_obj);
8798 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008799 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008801 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008802 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008803 if (kind2 > kind) {
8804 Py_DECREF(sub_obj);
8805 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008806 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008807 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008808 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008809 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008810 if (!buf2)
8811 goto onError;
8812 len1 = PyUnicode_GET_LENGTH(str_obj);
8813 len2 = PyUnicode_GET_LENGTH(sub_obj);
8814
8815 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008816 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008818 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8819 result = asciilib_count(
8820 ((Py_UCS1*)buf1) + start, end - start,
8821 buf2, len2, PY_SSIZE_T_MAX
8822 );
8823 else
8824 result = ucs1lib_count(
8825 ((Py_UCS1*)buf1) + start, end - start,
8826 buf2, len2, PY_SSIZE_T_MAX
8827 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 break;
8829 case PyUnicode_2BYTE_KIND:
8830 result = ucs2lib_count(
8831 ((Py_UCS2*)buf1) + start, end - start,
8832 buf2, len2, PY_SSIZE_T_MAX
8833 );
8834 break;
8835 case PyUnicode_4BYTE_KIND:
8836 result = ucs4lib_count(
8837 ((Py_UCS4*)buf1) + start, end - start,
8838 buf2, len2, PY_SSIZE_T_MAX
8839 );
8840 break;
8841 default:
8842 assert(0); result = 0;
8843 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008844
8845 Py_DECREF(sub_obj);
8846 Py_DECREF(str_obj);
8847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 if (kind2 != kind)
8849 PyMem_Free(buf2);
8850
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 onError:
8853 Py_DECREF(sub_obj);
8854 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 if (kind2 != kind && buf2)
8856 PyMem_Free(buf2);
8857 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858}
8859
Alexander Belopolsky40018472011-02-26 01:02:56 +00008860Py_ssize_t
8861PyUnicode_Find(PyObject *str,
8862 PyObject *sub,
8863 Py_ssize_t start,
8864 Py_ssize_t end,
8865 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008867 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008868
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008870 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008871 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008872 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008873 if (!sub) {
8874 Py_DECREF(str);
8875 return -2;
8876 }
8877 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8878 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 Py_DECREF(str);
8880 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881 }
Tim Petersced69f82003-09-16 20:30:58 +00008882
Victor Stinner794d5672011-10-10 03:21:36 +02008883 result = any_find_slice(direction,
8884 str, sub, start, end
8885 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008886
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008888 Py_DECREF(sub);
8889
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890 return result;
8891}
8892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893Py_ssize_t
8894PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8895 Py_ssize_t start, Py_ssize_t end,
8896 int direction)
8897{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008899 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 if (PyUnicode_READY(str) == -1)
8901 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008902 if (start < 0 || end < 0) {
8903 PyErr_SetString(PyExc_IndexError, "string index out of range");
8904 return -2;
8905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 if (end > PyUnicode_GET_LENGTH(str))
8907 end = PyUnicode_GET_LENGTH(str);
8908 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008909 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8910 kind, end-start, ch, direction);
8911 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008912 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008913 else
8914 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915}
8916
Alexander Belopolsky40018472011-02-26 01:02:56 +00008917static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008918tailmatch(PyObject *self,
8919 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008920 Py_ssize_t start,
8921 Py_ssize_t end,
8922 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 int kind_self;
8925 int kind_sub;
8926 void *data_self;
8927 void *data_sub;
8928 Py_ssize_t offset;
8929 Py_ssize_t i;
8930 Py_ssize_t end_sub;
8931
8932 if (PyUnicode_READY(self) == -1 ||
8933 PyUnicode_READY(substring) == -1)
8934 return 0;
8935
8936 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937 return 1;
8938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8940 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 kind_self = PyUnicode_KIND(self);
8945 data_self = PyUnicode_DATA(self);
8946 kind_sub = PyUnicode_KIND(substring);
8947 data_sub = PyUnicode_DATA(substring);
8948 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8949
8950 if (direction > 0)
8951 offset = end;
8952 else
8953 offset = start;
8954
8955 if (PyUnicode_READ(kind_self, data_self, offset) ==
8956 PyUnicode_READ(kind_sub, data_sub, 0) &&
8957 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8958 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8959 /* If both are of the same kind, memcmp is sufficient */
8960 if (kind_self == kind_sub) {
8961 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008962 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963 data_sub,
8964 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008965 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 }
8967 /* otherwise we have to compare each character by first accesing it */
8968 else {
8969 /* We do not need to compare 0 and len(substring)-1 because
8970 the if statement above ensured already that they are equal
8971 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02008972 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 for (i = 1; i < end_sub; ++i) {
8974 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8975 PyUnicode_READ(kind_sub, data_sub, i))
8976 return 0;
8977 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 }
8981
8982 return 0;
8983}
8984
Alexander Belopolsky40018472011-02-26 01:02:56 +00008985Py_ssize_t
8986PyUnicode_Tailmatch(PyObject *str,
8987 PyObject *substr,
8988 Py_ssize_t start,
8989 Py_ssize_t end,
8990 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008992 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008993
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994 str = PyUnicode_FromObject(str);
8995 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008996 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997 substr = PyUnicode_FromObject(substr);
8998 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 Py_DECREF(str);
9000 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001 }
Tim Petersced69f82003-09-16 20:30:58 +00009002
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009003 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 Py_DECREF(str);
9006 Py_DECREF(substr);
9007 return result;
9008}
9009
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010/* Apply fixfct filter to the Unicode object self and return a
9011 reference to the modified object */
9012
Alexander Belopolsky40018472011-02-26 01:02:56 +00009013static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009014fixup(PyObject *self,
9015 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 PyObject *u;
9018 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009019 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009021 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009024 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 /* fix functions return the new maximum character in a string,
9027 if the kind of the resulting unicode object does not change,
9028 everything is fine. Otherwise we need to change the string kind
9029 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009030 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009031
9032 if (maxchar_new == 0) {
9033 /* no changes */;
9034 if (PyUnicode_CheckExact(self)) {
9035 Py_DECREF(u);
9036 Py_INCREF(self);
9037 return self;
9038 }
9039 else
9040 return u;
9041 }
9042
Victor Stinnere6abb482012-05-02 01:15:40 +02009043 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044
Victor Stinnereaab6042011-12-11 22:22:39 +01009045 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009047
9048 /* In case the maximum character changed, we need to
9049 convert the string to the new category. */
9050 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9051 if (v == NULL) {
9052 Py_DECREF(u);
9053 return NULL;
9054 }
9055 if (maxchar_new > maxchar_old) {
9056 /* If the maxchar increased so that the kind changed, not all
9057 characters are representable anymore and we need to fix the
9058 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009059 _PyUnicode_FastCopyCharacters(v, 0,
9060 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009061 maxchar_old = fixfct(v);
9062 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009063 }
9064 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009065 _PyUnicode_FastCopyCharacters(v, 0,
9066 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009068 Py_DECREF(u);
9069 assert(_PyUnicode_CheckConsistency(v, 1));
9070 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071}
9072
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009073static PyObject *
9074ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009076 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9077 char *resdata, *data = PyUnicode_DATA(self);
9078 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009079
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009080 res = PyUnicode_New(len, 127);
9081 if (res == NULL)
9082 return NULL;
9083 resdata = PyUnicode_DATA(res);
9084 if (lower)
9085 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009086 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009087 _Py_bytes_upper(resdata, data, len);
9088 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089}
9090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009092handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009094 Py_ssize_t j;
9095 int final_sigma;
9096 Py_UCS4 c;
9097 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009098
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009099 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9100
9101 where ! is a negation and \p{xxx} is a character with property xxx.
9102 */
9103 for (j = i - 1; j >= 0; j--) {
9104 c = PyUnicode_READ(kind, data, j);
9105 if (!_PyUnicode_IsCaseIgnorable(c))
9106 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009108 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9109 if (final_sigma) {
9110 for (j = i + 1; j < length; j++) {
9111 c = PyUnicode_READ(kind, data, j);
9112 if (!_PyUnicode_IsCaseIgnorable(c))
9113 break;
9114 }
9115 final_sigma = j == length || !_PyUnicode_IsCased(c);
9116 }
9117 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118}
9119
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009120static int
9121lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9122 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009124 /* Obscure special case. */
9125 if (c == 0x3A3) {
9126 mapped[0] = handle_capital_sigma(kind, data, length, i);
9127 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009129 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130}
9131
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009132static Py_ssize_t
9133do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009135 Py_ssize_t i, k = 0;
9136 int n_res, j;
9137 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009138
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009139 c = PyUnicode_READ(kind, data, 0);
9140 n_res = _PyUnicode_ToUpperFull(c, mapped);
9141 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009142 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009143 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009145 for (i = 1; i < length; i++) {
9146 c = PyUnicode_READ(kind, data, i);
9147 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9148 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009149 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009150 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009151 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009152 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009153 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154}
9155
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009156static Py_ssize_t
9157do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9158 Py_ssize_t i, k = 0;
9159
9160 for (i = 0; i < length; i++) {
9161 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9162 int n_res, j;
9163 if (Py_UNICODE_ISUPPER(c)) {
9164 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9165 }
9166 else if (Py_UNICODE_ISLOWER(c)) {
9167 n_res = _PyUnicode_ToUpperFull(c, mapped);
9168 }
9169 else {
9170 n_res = 1;
9171 mapped[0] = c;
9172 }
9173 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009174 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009175 res[k++] = mapped[j];
9176 }
9177 }
9178 return k;
9179}
9180
9181static Py_ssize_t
9182do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9183 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009185 Py_ssize_t i, k = 0;
9186
9187 for (i = 0; i < length; i++) {
9188 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9189 int n_res, j;
9190 if (lower)
9191 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9192 else
9193 n_res = _PyUnicode_ToUpperFull(c, mapped);
9194 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009195 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009196 res[k++] = mapped[j];
9197 }
9198 }
9199 return k;
9200}
9201
9202static Py_ssize_t
9203do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9204{
9205 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9206}
9207
9208static Py_ssize_t
9209do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9210{
9211 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9212}
9213
Benjamin Petersone51757f2012-01-12 21:10:29 -05009214static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009215do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9216{
9217 Py_ssize_t i, k = 0;
9218
9219 for (i = 0; i < length; i++) {
9220 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9221 Py_UCS4 mapped[3];
9222 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9223 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009224 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009225 res[k++] = mapped[j];
9226 }
9227 }
9228 return k;
9229}
9230
9231static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009232do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9233{
9234 Py_ssize_t i, k = 0;
9235 int previous_is_cased;
9236
9237 previous_is_cased = 0;
9238 for (i = 0; i < length; i++) {
9239 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9240 Py_UCS4 mapped[3];
9241 int n_res, j;
9242
9243 if (previous_is_cased)
9244 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9245 else
9246 n_res = _PyUnicode_ToTitleFull(c, mapped);
9247
9248 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009249 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009250 res[k++] = mapped[j];
9251 }
9252
9253 previous_is_cased = _PyUnicode_IsCased(c);
9254 }
9255 return k;
9256}
9257
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009258static PyObject *
9259case_operation(PyObject *self,
9260 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9261{
9262 PyObject *res = NULL;
9263 Py_ssize_t length, newlength = 0;
9264 int kind, outkind;
9265 void *data, *outdata;
9266 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9267
Benjamin Petersoneea48462012-01-16 14:28:50 -05009268 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009269
9270 kind = PyUnicode_KIND(self);
9271 data = PyUnicode_DATA(self);
9272 length = PyUnicode_GET_LENGTH(self);
9273 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9274 if (tmp == NULL)
9275 return PyErr_NoMemory();
9276 newlength = perform(kind, data, length, tmp, &maxchar);
9277 res = PyUnicode_New(newlength, maxchar);
9278 if (res == NULL)
9279 goto leave;
9280 tmpend = tmp + newlength;
9281 outdata = PyUnicode_DATA(res);
9282 outkind = PyUnicode_KIND(res);
9283 switch (outkind) {
9284 case PyUnicode_1BYTE_KIND:
9285 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9286 break;
9287 case PyUnicode_2BYTE_KIND:
9288 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9289 break;
9290 case PyUnicode_4BYTE_KIND:
9291 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9292 break;
9293 default:
9294 assert(0);
9295 break;
9296 }
9297 leave:
9298 PyMem_FREE(tmp);
9299 return res;
9300}
9301
Tim Peters8ce9f162004-08-27 01:49:32 +00009302PyObject *
9303PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009306 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009308 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009309 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9310 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009311 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009313 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009315 int use_memcpy;
9316 unsigned char *res_data = NULL, *sep_data = NULL;
9317 PyObject *last_obj;
9318 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009319
Tim Peters05eba1f2004-08-27 21:32:02 +00009320 fseq = PySequence_Fast(seq, "");
9321 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009322 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009323 }
9324
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009325 /* NOTE: the following code can't call back into Python code,
9326 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009327 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009328
Tim Peters05eba1f2004-08-27 21:32:02 +00009329 seqlen = PySequence_Fast_GET_SIZE(fseq);
9330 /* If empty sequence, return u"". */
9331 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009332 Py_DECREF(fseq);
9333 Py_INCREF(unicode_empty);
9334 res = unicode_empty;
9335 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009336 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009337
Tim Peters05eba1f2004-08-27 21:32:02 +00009338 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009339 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009340 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009341 if (seqlen == 1) {
9342 if (PyUnicode_CheckExact(items[0])) {
9343 res = items[0];
9344 Py_INCREF(res);
9345 Py_DECREF(fseq);
9346 return res;
9347 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009348 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009349 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009350 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009351 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009352 /* Set up sep and seplen */
9353 if (separator == NULL) {
9354 /* fall back to a blank space separator */
9355 sep = PyUnicode_FromOrdinal(' ');
9356 if (!sep)
9357 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009358 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009359 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009360 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009361 else {
9362 if (!PyUnicode_Check(separator)) {
9363 PyErr_Format(PyExc_TypeError,
9364 "separator: expected str instance,"
9365 " %.80s found",
9366 Py_TYPE(separator)->tp_name);
9367 goto onError;
9368 }
9369 if (PyUnicode_READY(separator))
9370 goto onError;
9371 sep = separator;
9372 seplen = PyUnicode_GET_LENGTH(separator);
9373 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9374 /* inc refcount to keep this code path symmetric with the
9375 above case of a blank separator */
9376 Py_INCREF(sep);
9377 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009378 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009379 }
9380
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009381 /* There are at least two things to join, or else we have a subclass
9382 * of str in the sequence.
9383 * Do a pre-pass to figure out the total amount of space we'll
9384 * need (sz), and see whether all argument are strings.
9385 */
9386 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009387#ifdef Py_DEBUG
9388 use_memcpy = 0;
9389#else
9390 use_memcpy = 1;
9391#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009392 for (i = 0; i < seqlen; i++) {
9393 const Py_ssize_t old_sz = sz;
9394 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009395 if (!PyUnicode_Check(item)) {
9396 PyErr_Format(PyExc_TypeError,
9397 "sequence item %zd: expected str instance,"
9398 " %.80s found",
9399 i, Py_TYPE(item)->tp_name);
9400 goto onError;
9401 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 if (PyUnicode_READY(item) == -1)
9403 goto onError;
9404 sz += PyUnicode_GET_LENGTH(item);
9405 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009406 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009407 if (i != 0)
9408 sz += seplen;
9409 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9410 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009411 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009412 goto onError;
9413 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009414 if (use_memcpy && last_obj != NULL) {
9415 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9416 use_memcpy = 0;
9417 }
9418 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009419 }
Tim Petersced69f82003-09-16 20:30:58 +00009420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009422 if (res == NULL)
9423 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009424
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009425 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009426#ifdef Py_DEBUG
9427 use_memcpy = 0;
9428#else
9429 if (use_memcpy) {
9430 res_data = PyUnicode_1BYTE_DATA(res);
9431 kind = PyUnicode_KIND(res);
9432 if (seplen != 0)
9433 sep_data = PyUnicode_1BYTE_DATA(sep);
9434 }
9435#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009437 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009438 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009439 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009440 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009441 if (use_memcpy) {
9442 Py_MEMCPY(res_data,
9443 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009444 kind * seplen);
9445 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009446 }
9447 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009448 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009449 res_offset += seplen;
9450 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009451 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009452 itemlen = PyUnicode_GET_LENGTH(item);
9453 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009454 if (use_memcpy) {
9455 Py_MEMCPY(res_data,
9456 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009457 kind * itemlen);
9458 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009459 }
9460 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009461 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009462 res_offset += itemlen;
9463 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009464 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009465 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009466 if (use_memcpy)
9467 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009468 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009469 else
9470 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009471
Tim Peters05eba1f2004-08-27 21:32:02 +00009472 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009474 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476
Benjamin Peterson29060642009-01-31 22:14:21 +00009477 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009478 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009480 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481 return NULL;
9482}
9483
/* FILL(kind, data, value, start, length)
 *
 * Store `value` into `length` consecutive code units of the buffer `data`,
 * beginning at unit index `start`.  `kind` selects the unit width
 * (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND);
 * PyUnicode_WCHAR_KIND is rejected by an assertion.
 *
 * Every argument is parenthesized in the expansion, so expression arguments
 * (for example a conditional expression passed as `value`) keep their
 * meaning.  Note that `value` and `length` may still be evaluated more than
 * once: do not pass expressions with side effects.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9507
Victor Stinnerd3f08822012-05-29 12:57:52 +02009508void
9509_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9510 Py_UCS4 fill_char)
9511{
9512 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9513 const void *data = PyUnicode_DATA(unicode);
9514 assert(PyUnicode_IS_READY(unicode));
9515 assert(unicode_modifiable(unicode));
9516 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9517 assert(start >= 0);
9518 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9519 FILL(kind, data, fill_char, start, length);
9520}
9521
Victor Stinner3fe55312012-01-04 00:33:50 +01009522Py_ssize_t
9523PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9524 Py_UCS4 fill_char)
9525{
9526 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009527
9528 if (!PyUnicode_Check(unicode)) {
9529 PyErr_BadInternalCall();
9530 return -1;
9531 }
9532 if (PyUnicode_READY(unicode) == -1)
9533 return -1;
9534 if (unicode_check_modifiable(unicode))
9535 return -1;
9536
Victor Stinnerd3f08822012-05-29 12:57:52 +02009537 if (start < 0) {
9538 PyErr_SetString(PyExc_IndexError, "string index out of range");
9539 return -1;
9540 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009541 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9542 PyErr_SetString(PyExc_ValueError,
9543 "fill character is bigger than "
9544 "the string maximum character");
9545 return -1;
9546 }
9547
9548 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9549 length = Py_MIN(maxlen, length);
9550 if (length <= 0)
9551 return 0;
9552
Victor Stinnerd3f08822012-05-29 12:57:52 +02009553 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009554 return length;
9555}
9556
Victor Stinner9310abb2011-10-05 00:59:23 +02009557static PyObject *
9558pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009559 Py_ssize_t left,
9560 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 PyObject *u;
9564 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009565 int kind;
9566 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567
9568 if (left < 0)
9569 left = 0;
9570 if (right < 0)
9571 right = 0;
9572
Victor Stinnerc4b49542011-12-11 22:44:26 +01009573 if (left == 0 && right == 0)
9574 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9577 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009578 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9579 return NULL;
9580 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009582 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009584 if (!u)
9585 return NULL;
9586
9587 kind = PyUnicode_KIND(u);
9588 data = PyUnicode_DATA(u);
9589 if (left)
9590 FILL(kind, data, fill, 0, left);
9591 if (right)
9592 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009593 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009594 assert(_PyUnicode_CheckConsistency(u, 1));
9595 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009596}
9597
Alexander Belopolsky40018472011-02-26 01:02:56 +00009598PyObject *
9599PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009600{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009601 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009602
9603 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009604 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009605 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009606 if (PyUnicode_READY(string) == -1) {
9607 Py_DECREF(string);
9608 return NULL;
9609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610
Benjamin Petersonead6b532011-12-20 17:23:42 -06009611 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009613 if (PyUnicode_IS_ASCII(string))
9614 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009615 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009616 PyUnicode_GET_LENGTH(string), keepends);
9617 else
9618 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009619 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009620 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621 break;
9622 case PyUnicode_2BYTE_KIND:
9623 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009624 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 PyUnicode_GET_LENGTH(string), keepends);
9626 break;
9627 case PyUnicode_4BYTE_KIND:
9628 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009629 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 PyUnicode_GET_LENGTH(string), keepends);
9631 break;
9632 default:
9633 assert(0);
9634 list = 0;
9635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636 Py_DECREF(string);
9637 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638}
9639
Alexander Belopolsky40018472011-02-26 01:02:56 +00009640static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009641split(PyObject *self,
9642 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009643 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 int kind1, kind2, kind;
9646 void *buf1, *buf2;
9647 Py_ssize_t len1, len2;
9648 PyObject* out;
9649
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009651 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 if (PyUnicode_READY(self) == -1)
9654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009657 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009658 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009659 if (PyUnicode_IS_ASCII(self))
9660 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009661 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009662 PyUnicode_GET_LENGTH(self), maxcount
9663 );
9664 else
9665 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009666 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009667 PyUnicode_GET_LENGTH(self), maxcount
9668 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 case PyUnicode_2BYTE_KIND:
9670 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009671 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 PyUnicode_GET_LENGTH(self), maxcount
9673 );
9674 case PyUnicode_4BYTE_KIND:
9675 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009676 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 PyUnicode_GET_LENGTH(self), maxcount
9678 );
9679 default:
9680 assert(0);
9681 return NULL;
9682 }
9683
9684 if (PyUnicode_READY(substring) == -1)
9685 return NULL;
9686
9687 kind1 = PyUnicode_KIND(self);
9688 kind2 = PyUnicode_KIND(substring);
9689 kind = kind1 > kind2 ? kind1 : kind2;
9690 buf1 = PyUnicode_DATA(self);
9691 buf2 = PyUnicode_DATA(substring);
9692 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009693 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 if (!buf1)
9695 return NULL;
9696 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009697 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 if (!buf2) {
9699 if (kind1 != kind) PyMem_Free(buf1);
9700 return NULL;
9701 }
9702 len1 = PyUnicode_GET_LENGTH(self);
9703 len2 = PyUnicode_GET_LENGTH(substring);
9704
Benjamin Petersonead6b532011-12-20 17:23:42 -06009705 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009707 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9708 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009709 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009710 else
9711 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009712 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 break;
9714 case PyUnicode_2BYTE_KIND:
9715 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009716 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 break;
9718 case PyUnicode_4BYTE_KIND:
9719 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009720 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 break;
9722 default:
9723 out = NULL;
9724 }
9725 if (kind1 != kind)
9726 PyMem_Free(buf1);
9727 if (kind2 != kind)
9728 PyMem_Free(buf2);
9729 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730}
9731
Alexander Belopolsky40018472011-02-26 01:02:56 +00009732static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009733rsplit(PyObject *self,
9734 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009735 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 int kind1, kind2, kind;
9738 void *buf1, *buf2;
9739 Py_ssize_t len1, len2;
9740 PyObject* out;
9741
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009742 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009743 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745 if (PyUnicode_READY(self) == -1)
9746 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009749 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009751 if (PyUnicode_IS_ASCII(self))
9752 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009753 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009754 PyUnicode_GET_LENGTH(self), maxcount
9755 );
9756 else
9757 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009758 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009759 PyUnicode_GET_LENGTH(self), maxcount
9760 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 case PyUnicode_2BYTE_KIND:
9762 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009763 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 PyUnicode_GET_LENGTH(self), maxcount
9765 );
9766 case PyUnicode_4BYTE_KIND:
9767 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009768 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 PyUnicode_GET_LENGTH(self), maxcount
9770 );
9771 default:
9772 assert(0);
9773 return NULL;
9774 }
9775
9776 if (PyUnicode_READY(substring) == -1)
9777 return NULL;
9778
9779 kind1 = PyUnicode_KIND(self);
9780 kind2 = PyUnicode_KIND(substring);
9781 kind = kind1 > kind2 ? kind1 : kind2;
9782 buf1 = PyUnicode_DATA(self);
9783 buf2 = PyUnicode_DATA(substring);
9784 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009785 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786 if (!buf1)
9787 return NULL;
9788 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009789 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 if (!buf2) {
9791 if (kind1 != kind) PyMem_Free(buf1);
9792 return NULL;
9793 }
9794 len1 = PyUnicode_GET_LENGTH(self);
9795 len2 = PyUnicode_GET_LENGTH(substring);
9796
Benjamin Petersonead6b532011-12-20 17:23:42 -06009797 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009798 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009799 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9800 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009801 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009802 else
9803 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009804 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 break;
9806 case PyUnicode_2BYTE_KIND:
9807 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009808 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 break;
9810 case PyUnicode_4BYTE_KIND:
9811 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009812 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 break;
9814 default:
9815 out = NULL;
9816 }
9817 if (kind1 != kind)
9818 PyMem_Free(buf1);
9819 if (kind2 != kind)
9820 PyMem_Free(buf2);
9821 return out;
9822}
9823
9824static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009825anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9826 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009828 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009830 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9831 return asciilib_find(buf1, len1, buf2, len2, offset);
9832 else
9833 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 case PyUnicode_2BYTE_KIND:
9835 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9836 case PyUnicode_4BYTE_KIND:
9837 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9838 }
9839 assert(0);
9840 return -1;
9841}
9842
9843static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009844anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9845 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009847 switch (kind) {
9848 case PyUnicode_1BYTE_KIND:
9849 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9850 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9851 else
9852 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9853 case PyUnicode_2BYTE_KIND:
9854 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9855 case PyUnicode_4BYTE_KIND:
9856 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9857 }
9858 assert(0);
9859 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009860}
9861
Alexander Belopolsky40018472011-02-26 01:02:56 +00009862static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863replace(PyObject *self, PyObject *str1,
9864 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 PyObject *u;
9867 char *sbuf = PyUnicode_DATA(self);
9868 char *buf1 = PyUnicode_DATA(str1);
9869 char *buf2 = PyUnicode_DATA(str2);
9870 int srelease = 0, release1 = 0, release2 = 0;
9871 int skind = PyUnicode_KIND(self);
9872 int kind1 = PyUnicode_KIND(str1);
9873 int kind2 = PyUnicode_KIND(str2);
9874 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9875 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9876 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009877 int mayshrink;
9878 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879
9880 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009881 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009882 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009883 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884
Victor Stinner59de0ee2011-10-07 10:01:28 +02009885 if (str1 == str2)
9886 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 if (skind < kind1)
9888 /* substring too wide to be present */
9889 goto nothing;
9890
Victor Stinner49a0a212011-10-12 23:46:10 +02009891 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9892 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9893 /* Replacing str1 with str2 may cause a maxchar reduction in the
9894 result string. */
9895 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +02009896 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009899 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009901 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009903 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009904 Py_UCS4 u1, u2;
9905 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +01009906 Py_ssize_t index, pos;
9907 char *src;
9908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +01009910 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9911 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009912 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009915 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +02009917 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +01009919
9920 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9921 index = 0;
9922 src = sbuf;
9923 while (--maxcount)
9924 {
9925 pos++;
9926 src += pos * PyUnicode_KIND(self);
9927 slen -= pos;
9928 index += pos;
9929 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9930 if (pos < 0)
9931 break;
9932 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9933 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009934 }
9935 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 int rkind = skind;
9937 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009938 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 if (kind1 < rkind) {
9941 /* widen substring */
9942 buf1 = _PyUnicode_AsKind(str1, rkind);
9943 if (!buf1) goto error;
9944 release1 = 1;
9945 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009946 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009947 if (i < 0)
9948 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 if (rkind > kind2) {
9950 /* widen replacement */
9951 buf2 = _PyUnicode_AsKind(str2, rkind);
9952 if (!buf2) goto error;
9953 release2 = 1;
9954 }
9955 else if (rkind < kind2) {
9956 /* widen self and buf1 */
9957 rkind = kind2;
9958 if (release1) PyMem_Free(buf1);
9959 sbuf = _PyUnicode_AsKind(self, rkind);
9960 if (!sbuf) goto error;
9961 srelease = 1;
9962 buf1 = _PyUnicode_AsKind(str1, rkind);
9963 if (!buf1) goto error;
9964 release1 = 1;
9965 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009966 u = PyUnicode_New(slen, maxchar);
9967 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009969 assert(PyUnicode_KIND(u) == rkind);
9970 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009971
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009972 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009973 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009974 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009976 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009978
9979 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009980 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009981 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009982 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009983 if (i == -1)
9984 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009985 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009987 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009991 }
9992 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01009994 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 int rkind = skind;
9996 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009999 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 buf1 = _PyUnicode_AsKind(str1, rkind);
10001 if (!buf1) goto error;
10002 release1 = 1;
10003 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010004 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010005 if (n == 0)
10006 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010008 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 buf2 = _PyUnicode_AsKind(str2, rkind);
10010 if (!buf2) goto error;
10011 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010014 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 rkind = kind2;
10016 sbuf = _PyUnicode_AsKind(self, rkind);
10017 if (!sbuf) goto error;
10018 srelease = 1;
10019 if (release1) PyMem_Free(buf1);
10020 buf1 = _PyUnicode_AsKind(str1, rkind);
10021 if (!buf1) goto error;
10022 release1 = 1;
10023 }
10024 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10025 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010026 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 PyErr_SetString(PyExc_OverflowError,
10028 "replace string is too long");
10029 goto error;
10030 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010031 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010032 if (new_size == 0) {
10033 Py_INCREF(unicode_empty);
10034 u = unicode_empty;
10035 goto done;
10036 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010037 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 PyErr_SetString(PyExc_OverflowError,
10039 "replace string is too long");
10040 goto error;
10041 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010042 u = PyUnicode_New(new_size, maxchar);
10043 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010045 assert(PyUnicode_KIND(u) == rkind);
10046 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 ires = i = 0;
10048 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010049 while (n-- > 0) {
10050 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010051 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010052 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010053 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010054 if (j == -1)
10055 break;
10056 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010057 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010058 memcpy(res + rkind * ires,
10059 sbuf + rkind * i,
10060 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010062 }
10063 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010065 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010067 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010071 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010073 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010074 memcpy(res + rkind * ires,
10075 sbuf + rkind * i,
10076 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010077 }
10078 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010079 /* interleave */
10080 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010081 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010083 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010085 if (--n <= 0)
10086 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010087 memcpy(res + rkind * ires,
10088 sbuf + rkind * i,
10089 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 ires++;
10091 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010092 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010093 memcpy(res + rkind * ires,
10094 sbuf + rkind * i,
10095 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010096 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010097 }
10098
10099 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010100 unicode_adjust_maxchar(&u);
10101 if (u == NULL)
10102 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010103 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010104
10105 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 if (srelease)
10107 PyMem_FREE(sbuf);
10108 if (release1)
10109 PyMem_FREE(buf1);
10110 if (release2)
10111 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010112 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010114
Benjamin Peterson29060642009-01-31 22:14:21 +000010115 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010116 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 if (srelease)
10118 PyMem_FREE(sbuf);
10119 if (release1)
10120 PyMem_FREE(buf1);
10121 if (release2)
10122 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010123 return unicode_result_unchanged(self);
10124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 error:
10126 if (srelease && sbuf)
10127 PyMem_FREE(sbuf);
10128 if (release1 && buf1)
10129 PyMem_FREE(buf1);
10130 if (release2 && buf2)
10131 PyMem_FREE(buf2);
10132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133}
10134
10135/* --- Unicode Object Methods --------------------------------------------- */
10136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010137PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010138 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139\n\
10140Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010141characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142
10143static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010144unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010146 if (PyUnicode_READY(self) == -1)
10147 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010148 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149}
10150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010151PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010152 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153\n\
10154Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010155have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156
10157static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010158unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010160 if (PyUnicode_READY(self) == -1)
10161 return NULL;
10162 if (PyUnicode_GET_LENGTH(self) == 0)
10163 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010164 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165}
10166
Benjamin Petersond5890c82012-01-14 13:23:30 -050010167PyDoc_STRVAR(casefold__doc__,
10168 "S.casefold() -> str\n\
10169\n\
10170Return a version of S suitable for caseless comparisons.");
10171
10172static PyObject *
10173unicode_casefold(PyObject *self)
10174{
10175 if (PyUnicode_READY(self) == -1)
10176 return NULL;
10177 if (PyUnicode_IS_ASCII(self))
10178 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010179 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010180}
10181
10182
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010183/* Argument converter. Coerces to a single unicode character */
10184
10185static int
10186convert_uc(PyObject *obj, void *addr)
10187{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010189 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010190
Benjamin Peterson14339b62009-01-31 16:36:08 +000010191 uniobj = PyUnicode_FromObject(obj);
10192 if (uniobj == NULL) {
10193 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010194 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010195 return 0;
10196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010198 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010199 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010200 Py_DECREF(uniobj);
10201 return 0;
10202 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010204 Py_DECREF(uniobj);
10205 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010206}
10207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010208PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010209 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010211Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010212done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213
10214static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010215unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010217 Py_ssize_t marg, left;
10218 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 Py_UCS4 fillchar = ' ';
10220
Victor Stinnere9a29352011-10-01 02:14:59 +020010221 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223
Benjamin Petersonbac79492012-01-14 13:34:47 -050010224 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225 return NULL;
10226
Victor Stinnerc4b49542011-12-11 22:44:26 +010010227 if (PyUnicode_GET_LENGTH(self) >= width)
10228 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229
Victor Stinnerc4b49542011-12-11 22:44:26 +010010230 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231 left = marg / 2 + (marg & width & 1);
10232
Victor Stinner9310abb2011-10-05 00:59:23 +020010233 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234}
10235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236/* This function assumes that str1 and str2 are readied by the caller. */
10237
Marc-André Lemburge5034372000-08-08 08:04:29 +000010238static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010239unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010240{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 int kind1, kind2;
10242 void *data1, *data2;
Victor Stinner770e19e2012-10-04 22:59:45 +020010243 Py_ssize_t len1, len2;
10244 Py_ssize_t i, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010245
Victor Stinner90db9c42012-10-04 21:53:50 +020010246 /* a string is equal to itself */
10247 if (str1 == str2)
10248 return 0;
10249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 kind1 = PyUnicode_KIND(str1);
10251 kind2 = PyUnicode_KIND(str2);
10252 data1 = PyUnicode_DATA(str1);
10253 data2 = PyUnicode_DATA(str2);
10254 len1 = PyUnicode_GET_LENGTH(str1);
10255 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010256 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010257
Victor Stinner770e19e2012-10-04 22:59:45 +020010258 if (kind1 == 1 && kind2 == 1) {
10259 int cmp = memcmp(data1, data2, len);
10260 /* normalize result of memcmp() into the range [-1; 1] */
10261 if (cmp < 0)
10262 return -1;
10263 if (cmp > 0)
10264 return 1;
10265 }
10266 else {
10267 for (i = 0; i < len; ++i) {
10268 Py_UCS4 c1, c2;
10269 c1 = PyUnicode_READ(kind1, data1, i);
10270 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010271
Victor Stinner770e19e2012-10-04 22:59:45 +020010272 if (c1 != c2)
10273 return (c1 < c2) ? -1 : 1;
10274 }
Marc-André Lemburge5034372000-08-08 08:04:29 +000010275 }
10276
Victor Stinner770e19e2012-10-04 22:59:45 +020010277 if (len1 == len2)
10278 return 0;
10279 if (len1 < len2)
10280 return -1;
10281 else
10282 return 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010283}
10284
Victor Stinnere5567ad2012-10-23 02:48:49 +020010285static int
10286unicode_compare_eq(PyObject *str1, PyObject *str2)
10287{
10288 int kind;
10289 void *data1, *data2;
10290 Py_ssize_t len;
10291 int cmp;
10292
10293 /* a string is equal to itself */
10294 if (str1 == str2)
10295 return 1;
10296
10297 len = PyUnicode_GET_LENGTH(str1);
10298 if (PyUnicode_GET_LENGTH(str2) != len)
10299 return 0;
10300 kind = PyUnicode_KIND(str1);
10301 if (PyUnicode_KIND(str2) != kind)
10302 return 0;
10303 data1 = PyUnicode_DATA(str1);
10304 data2 = PyUnicode_DATA(str2);
10305
10306 cmp = memcmp(data1, data2, len * kind);
10307 return (cmp == 0);
10308}
10309
10310
Alexander Belopolsky40018472011-02-26 01:02:56 +000010311int
10312PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10315 if (PyUnicode_READY(left) == -1 ||
10316 PyUnicode_READY(right) == -1)
10317 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010318 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010320 PyErr_Format(PyExc_TypeError,
10321 "Can't compare %.100s and %.100s",
10322 left->ob_type->tp_name,
10323 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324 return -1;
10325}
10326
Martin v. Löwis5b222132007-06-10 09:51:05 +000010327int
10328PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10329{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 Py_ssize_t i;
10331 int kind;
10332 void *data;
10333 Py_UCS4 chr;
10334
Victor Stinner910337b2011-10-03 03:20:16 +020010335 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 if (PyUnicode_READY(uni) == -1)
10337 return -1;
10338 kind = PyUnicode_KIND(uni);
10339 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010340 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10342 if (chr != str[i])
10343 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010344 /* This check keeps Python strings that end in '\0' from comparing equal
10345 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010347 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010348 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010349 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010350 return 0;
10351}
10352
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010353
Benjamin Peterson29060642009-01-31 22:14:21 +000010354#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010355 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010356
Alexander Belopolsky40018472011-02-26 01:02:56 +000010357PyObject *
10358PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010359{
10360 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010361 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010362
Victor Stinnere5567ad2012-10-23 02:48:49 +020010363 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10364 Py_RETURN_NOTIMPLEMENTED;
10365
10366 if (PyUnicode_READY(left) == -1 ||
10367 PyUnicode_READY(right) == -1)
10368 return NULL;
10369
10370 if (op == Py_EQ || op == Py_NE) {
10371 result = unicode_compare_eq(left, right);
10372 if (op == Py_EQ)
10373 v = TEST_COND(result);
10374 else
10375 v = TEST_COND(!result);
10376 }
10377 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010378 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010379
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010380 /* Convert the return value to a Boolean */
10381 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010382 case Py_LE:
10383 v = TEST_COND(result <= 0);
10384 break;
10385 case Py_GE:
10386 v = TEST_COND(result >= 0);
10387 break;
10388 case Py_LT:
10389 v = TEST_COND(result == -1);
10390 break;
10391 case Py_GT:
10392 v = TEST_COND(result == 1);
10393 break;
10394 default:
10395 PyErr_BadArgument();
10396 return NULL;
10397 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010398 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010399 Py_INCREF(v);
10400 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010401}
10402
Alexander Belopolsky40018472011-02-26 01:02:56 +000010403int
10404PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010405{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010406 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 int kind1, kind2, kind;
10408 void *buf1, *buf2;
10409 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010410 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010411
10412 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010413 sub = PyUnicode_FromObject(element);
10414 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010415 PyErr_Format(PyExc_TypeError,
10416 "'in <string>' requires string as left operand, not %s",
10417 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010418 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010419 }
10420
Thomas Wouters477c8d52006-05-27 19:21:47 +000010421 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010422 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010423 Py_DECREF(sub);
10424 return -1;
10425 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010426 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10427 Py_DECREF(sub);
10428 Py_DECREF(str);
10429 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 kind1 = PyUnicode_KIND(str);
10432 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010433 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 buf1 = PyUnicode_DATA(str);
10435 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010436 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010437 if (kind2 > kind) {
10438 Py_DECREF(sub);
10439 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010440 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010441 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010442 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 if (!buf2) {
10445 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010446 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 return -1;
10448 }
10449 len1 = PyUnicode_GET_LENGTH(str);
10450 len2 = PyUnicode_GET_LENGTH(sub);
10451
Benjamin Petersonead6b532011-12-20 17:23:42 -060010452 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 case PyUnicode_1BYTE_KIND:
10454 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10455 break;
10456 case PyUnicode_2BYTE_KIND:
10457 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10458 break;
10459 case PyUnicode_4BYTE_KIND:
10460 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10461 break;
10462 default:
10463 result = -1;
10464 assert(0);
10465 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010466
10467 Py_DECREF(str);
10468 Py_DECREF(sub);
10469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 if (kind2 != kind)
10471 PyMem_Free(buf2);
10472
Guido van Rossum403d68b2000-03-13 15:55:09 +000010473 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010474}
10475
Guido van Rossumd57fd912000-03-10 22:53:23 +000010476/* Concat to string or Unicode object giving a new Unicode object. */
10477
Alexander Belopolsky40018472011-02-26 01:02:56 +000010478PyObject *
10479PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010482 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010483 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484
10485 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010488 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010491 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492
10493 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010494 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010495 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010498 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010499 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501 }
10502
Victor Stinner488fa492011-12-12 00:01:39 +010010503 u_len = PyUnicode_GET_LENGTH(u);
10504 v_len = PyUnicode_GET_LENGTH(v);
10505 if (u_len > PY_SSIZE_T_MAX - v_len) {
10506 PyErr_SetString(PyExc_OverflowError,
10507 "strings are too large to concat");
10508 goto onError;
10509 }
10510 new_len = u_len + v_len;
10511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010513 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010514 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010517 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010519 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010520 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10521 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522 Py_DECREF(u);
10523 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010524 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526
Benjamin Peterson29060642009-01-31 22:14:21 +000010527 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528 Py_XDECREF(u);
10529 Py_XDECREF(v);
10530 return NULL;
10531}
10532
Walter Dörwald1ab83302007-05-18 17:15:44 +000010533void
Victor Stinner23e56682011-10-03 03:54:37 +020010534PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010535{
Victor Stinner23e56682011-10-03 03:54:37 +020010536 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010537 Py_UCS4 maxchar, maxchar2;
10538 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010539
10540 if (p_left == NULL) {
10541 if (!PyErr_Occurred())
10542 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010543 return;
10544 }
Victor Stinner23e56682011-10-03 03:54:37 +020010545 left = *p_left;
10546 if (right == NULL || !PyUnicode_Check(left)) {
10547 if (!PyErr_Occurred())
10548 PyErr_BadInternalCall();
10549 goto error;
10550 }
10551
Benjamin Petersonbac79492012-01-14 13:34:47 -050010552 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010553 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010554 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010555 goto error;
10556
Victor Stinner488fa492011-12-12 00:01:39 +010010557 /* Shortcuts */
10558 if (left == unicode_empty) {
10559 Py_DECREF(left);
10560 Py_INCREF(right);
10561 *p_left = right;
10562 return;
10563 }
10564 if (right == unicode_empty)
10565 return;
10566
10567 left_len = PyUnicode_GET_LENGTH(left);
10568 right_len = PyUnicode_GET_LENGTH(right);
10569 if (left_len > PY_SSIZE_T_MAX - right_len) {
10570 PyErr_SetString(PyExc_OverflowError,
10571 "strings are too large to concat");
10572 goto error;
10573 }
10574 new_len = left_len + right_len;
10575
10576 if (unicode_modifiable(left)
10577 && PyUnicode_CheckExact(right)
10578 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010579 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10580 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010581 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010582 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010583 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10584 {
10585 /* append inplace */
10586 if (unicode_resize(p_left, new_len) != 0) {
10587 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10588 * deallocated so it cannot be put back into
10589 * 'variable'. The MemoryError is raised when there
10590 * is no value in 'variable', which might (very
10591 * remotely) be a cause of incompatibilities.
10592 */
10593 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010594 }
Victor Stinner488fa492011-12-12 00:01:39 +010010595 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010596 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010597 }
Victor Stinner488fa492011-12-12 00:01:39 +010010598 else {
10599 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10600 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010601 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010602
Victor Stinner488fa492011-12-12 00:01:39 +010010603 /* Concat the two Unicode strings */
10604 res = PyUnicode_New(new_len, maxchar);
10605 if (res == NULL)
10606 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010607 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10608 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010609 Py_DECREF(left);
10610 *p_left = res;
10611 }
10612 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010613 return;
10614
10615error:
Victor Stinner488fa492011-12-12 00:01:39 +010010616 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010617}
10618
10619void
10620PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10621{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010622 PyUnicode_Append(pleft, right);
10623 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010624}
10625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010626PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010627 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010630string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010631interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632
10633static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010634unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010636 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010637 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010638 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 int kind1, kind2, kind;
10641 void *buf1, *buf2;
10642 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643
Jesus Ceaac451502011-04-20 17:09:23 +020010644 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10645 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 kind1 = PyUnicode_KIND(self);
10649 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010650 if (kind2 > kind1)
10651 return PyLong_FromLong(0);
10652 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 buf1 = PyUnicode_DATA(self);
10654 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010656 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 if (!buf2) {
10658 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 return NULL;
10660 }
10661 len1 = PyUnicode_GET_LENGTH(self);
10662 len2 = PyUnicode_GET_LENGTH(substring);
10663
10664 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010665 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 case PyUnicode_1BYTE_KIND:
10667 iresult = ucs1lib_count(
10668 ((Py_UCS1*)buf1) + start, end - start,
10669 buf2, len2, PY_SSIZE_T_MAX
10670 );
10671 break;
10672 case PyUnicode_2BYTE_KIND:
10673 iresult = ucs2lib_count(
10674 ((Py_UCS2*)buf1) + start, end - start,
10675 buf2, len2, PY_SSIZE_T_MAX
10676 );
10677 break;
10678 case PyUnicode_4BYTE_KIND:
10679 iresult = ucs4lib_count(
10680 ((Py_UCS4*)buf1) + start, end - start,
10681 buf2, len2, PY_SSIZE_T_MAX
10682 );
10683 break;
10684 default:
10685 assert(0); iresult = 0;
10686 }
10687
10688 result = PyLong_FromSsize_t(iresult);
10689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (kind2 != kind)
10691 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692
10693 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010694
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695 return result;
10696}
10697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010698PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010699 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010701Encode S using the codec registered for encoding. Default encoding\n\
10702is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010703handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010704a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10705'xmlcharrefreplace' as well as any other name registered with\n\
10706codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707
10708static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010709unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010711 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712 char *encoding = NULL;
10713 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010714
Benjamin Peterson308d6372009-09-18 21:42:35 +000010715 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10716 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010718 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010719}
10720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010721PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723\n\
10724Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010725If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726
10727static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010728unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010730 Py_ssize_t i, j, line_pos, src_len, incr;
10731 Py_UCS4 ch;
10732 PyObject *u;
10733 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010735 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010736 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737
10738 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740
Antoine Pitrou22425222011-10-04 19:10:51 +020010741 if (PyUnicode_READY(self) == -1)
10742 return NULL;
10743
Thomas Wouters7e474022000-07-16 12:04:32 +000010744 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010745 src_len = PyUnicode_GET_LENGTH(self);
10746 i = j = line_pos = 0;
10747 kind = PyUnicode_KIND(self);
10748 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010749 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010750 for (; i < src_len; i++) {
10751 ch = PyUnicode_READ(kind, src_data, i);
10752 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010753 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010754 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010755 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010756 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010757 goto overflow;
10758 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010759 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010760 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010761 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010763 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010764 goto overflow;
10765 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010767 if (ch == '\n' || ch == '\r')
10768 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010770 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010771 if (!found)
10772 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010773
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010775 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776 if (!u)
10777 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010778 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779
Antoine Pitroue71d5742011-10-04 15:55:09 +020010780 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781
Antoine Pitroue71d5742011-10-04 15:55:09 +020010782 for (; i < src_len; i++) {
10783 ch = PyUnicode_READ(kind, src_data, i);
10784 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010785 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010786 incr = tabsize - (line_pos % tabsize);
10787 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010788 FILL(kind, dest_data, ' ', j, incr);
10789 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010790 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010791 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010792 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010793 line_pos++;
10794 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010795 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010796 if (ch == '\n' || ch == '\r')
10797 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010799 }
10800 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010801 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010802
Antoine Pitroue71d5742011-10-04 15:55:09 +020010803 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010804 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806}
10807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010808PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010809 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810\n\
10811Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010812such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813arguments start and end are interpreted as in slice notation.\n\
10814\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010815Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816
10817static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010820 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010821 Py_ssize_t start;
10822 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010823 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824
Jesus Ceaac451502011-04-20 17:09:23 +020010825 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10826 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 if (PyUnicode_READY(self) == -1)
10830 return NULL;
10831 if (PyUnicode_READY(substring) == -1)
10832 return NULL;
10833
Victor Stinner7931d9a2011-11-04 00:22:48 +010010834 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835
10836 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 if (result == -2)
10839 return NULL;
10840
Christian Heimes217cfd12007-12-02 14:31:20 +000010841 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842}
10843
10844static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010845unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010847 void *data;
10848 enum PyUnicode_Kind kind;
10849 Py_UCS4 ch;
10850 PyObject *res;
10851
10852 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10853 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010855 }
10856 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10857 PyErr_SetString(PyExc_IndexError, "string index out of range");
10858 return NULL;
10859 }
10860 kind = PyUnicode_KIND(self);
10861 data = PyUnicode_DATA(self);
10862 ch = PyUnicode_READ(kind, data, index);
10863 if (ch < 256)
10864 return get_latin1_char(ch);
10865
10866 res = PyUnicode_New(1, ch);
10867 if (res == NULL)
10868 return NULL;
10869 kind = PyUnicode_KIND(res);
10870 data = PyUnicode_DATA(res);
10871 PyUnicode_WRITE(kind, data, 0, ch);
10872 assert(_PyUnicode_CheckConsistency(res, 1));
10873 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874}
10875
Guido van Rossumc2504932007-09-18 19:42:40 +000010876/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010877 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010878static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010879unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880{
Guido van Rossumc2504932007-09-18 19:42:40 +000010881 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010882 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010883
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010884#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050010885 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010886#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010887 if (_PyUnicode_HASH(self) != -1)
10888 return _PyUnicode_HASH(self);
10889 if (PyUnicode_READY(self) == -1)
10890 return -1;
10891 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010010892 /*
10893 We make the hash of the empty string be 0, rather than using
10894 (prefix ^ suffix), since this slightly obfuscates the hash secret
10895 */
10896 if (len == 0) {
10897 _PyUnicode_HASH(self) = 0;
10898 return 0;
10899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900
10901 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010010902#define HASH(P) \
10903 x ^= (Py_uhash_t) *P << 7; \
10904 while (--len >= 0) \
10905 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906
Georg Brandl2fb477c2012-02-21 00:33:36 +010010907 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 switch (PyUnicode_KIND(self)) {
10909 case PyUnicode_1BYTE_KIND: {
10910 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10911 HASH(c);
10912 break;
10913 }
10914 case PyUnicode_2BYTE_KIND: {
10915 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10916 HASH(s);
10917 break;
10918 }
10919 default: {
10920 Py_UCS4 *l;
10921 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10922 "Impossible switch case in unicode_hash");
10923 l = PyUnicode_4BYTE_DATA(self);
10924 HASH(l);
10925 break;
10926 }
10927 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010010928 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
10929 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930
Guido van Rossumc2504932007-09-18 19:42:40 +000010931 if (x == -1)
10932 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010934 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010938PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010939 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010941Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942
10943static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010946 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010947 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010948 Py_ssize_t start;
10949 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950
Jesus Ceaac451502011-04-20 17:09:23 +020010951 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10952 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 if (PyUnicode_READY(self) == -1)
10956 return NULL;
10957 if (PyUnicode_READY(substring) == -1)
10958 return NULL;
10959
Victor Stinner7931d9a2011-11-04 00:22:48 +010010960 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961
10962 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 if (result == -2)
10965 return NULL;
10966
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967 if (result < 0) {
10968 PyErr_SetString(PyExc_ValueError, "substring not found");
10969 return NULL;
10970 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010971
Christian Heimes217cfd12007-12-02 14:31:20 +000010972 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973}
10974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010975PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010978Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010979at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980
10981static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010982unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 Py_ssize_t i, length;
10985 int kind;
10986 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987 int cased;
10988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 if (PyUnicode_READY(self) == -1)
10990 return NULL;
10991 length = PyUnicode_GET_LENGTH(self);
10992 kind = PyUnicode_KIND(self);
10993 data = PyUnicode_DATA(self);
10994
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 if (length == 1)
10997 return PyBool_FromLong(
10998 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011000 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011002 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011003
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 for (i = 0; i < length; i++) {
11006 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011007
Benjamin Peterson29060642009-01-31 22:14:21 +000011008 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11009 return PyBool_FromLong(0);
11010 else if (!cased && Py_UNICODE_ISLOWER(ch))
11011 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011013 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014}
11015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011016PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011017 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011019Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011020at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021
11022static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011023unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025 Py_ssize_t i, length;
11026 int kind;
11027 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028 int cased;
11029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 if (PyUnicode_READY(self) == -1)
11031 return NULL;
11032 length = PyUnicode_GET_LENGTH(self);
11033 kind = PyUnicode_KIND(self);
11034 data = PyUnicode_DATA(self);
11035
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 if (length == 1)
11038 return PyBool_FromLong(
11039 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011041 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011043 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011044
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046 for (i = 0; i < length; i++) {
11047 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011048
Benjamin Peterson29060642009-01-31 22:14:21 +000011049 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11050 return PyBool_FromLong(0);
11051 else if (!cased && Py_UNICODE_ISUPPER(ch))
11052 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011054 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055}
11056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011057PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011058 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011060Return True if S is a titlecased string and there is at least one\n\
11061character in S, i.e. upper- and titlecase characters may only\n\
11062follow uncased characters and lowercase characters only cased ones.\n\
11063Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
11065static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011066unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068 Py_ssize_t i, length;
11069 int kind;
11070 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071 int cased, previous_is_cased;
11072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 if (PyUnicode_READY(self) == -1)
11074 return NULL;
11075 length = PyUnicode_GET_LENGTH(self);
11076 kind = PyUnicode_KIND(self);
11077 data = PyUnicode_DATA(self);
11078
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 if (length == 1) {
11081 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11082 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11083 (Py_UNICODE_ISUPPER(ch) != 0));
11084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011086 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011088 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011089
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090 cased = 0;
11091 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 for (i = 0; i < length; i++) {
11093 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011094
Benjamin Peterson29060642009-01-31 22:14:21 +000011095 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11096 if (previous_is_cased)
11097 return PyBool_FromLong(0);
11098 previous_is_cased = 1;
11099 cased = 1;
11100 }
11101 else if (Py_UNICODE_ISLOWER(ch)) {
11102 if (!previous_is_cased)
11103 return PyBool_FromLong(0);
11104 previous_is_cased = 1;
11105 cased = 1;
11106 }
11107 else
11108 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011110 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011111}
11112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011113PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011114 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011116Return True if all characters in S are whitespace\n\
11117and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118
11119static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011120unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011122 Py_ssize_t i, length;
11123 int kind;
11124 void *data;
11125
11126 if (PyUnicode_READY(self) == -1)
11127 return NULL;
11128 length = PyUnicode_GET_LENGTH(self);
11129 kind = PyUnicode_KIND(self);
11130 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 if (length == 1)
11134 return PyBool_FromLong(
11135 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011137 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 for (i = 0; i < length; i++) {
11142 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011143 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011146 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147}
11148
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011149PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011150 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011151\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011152Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011153and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011154
11155static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011156unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011157{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 Py_ssize_t i, length;
11159 int kind;
11160 void *data;
11161
11162 if (PyUnicode_READY(self) == -1)
11163 return NULL;
11164 length = PyUnicode_GET_LENGTH(self);
11165 kind = PyUnicode_KIND(self);
11166 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011167
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011168 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 if (length == 1)
11170 return PyBool_FromLong(
11171 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011172
11173 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011175 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 for (i = 0; i < length; i++) {
11178 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011179 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011180 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011181 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011182}
11183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011184PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011186\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011187Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011188and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011189
11190static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011191unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 int kind;
11194 void *data;
11195 Py_ssize_t len, i;
11196
11197 if (PyUnicode_READY(self) == -1)
11198 return NULL;
11199
11200 kind = PyUnicode_KIND(self);
11201 data = PyUnicode_DATA(self);
11202 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011203
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011204 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 if (len == 1) {
11206 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11207 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11208 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011209
11210 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 for (i = 0; i < len; i++) {
11215 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011216 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011218 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011219 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011220}
11221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011222PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011223 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011225Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011226False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227
11228static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011229unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 Py_ssize_t i, length;
11232 int kind;
11233 void *data;
11234
11235 if (PyUnicode_READY(self) == -1)
11236 return NULL;
11237 length = PyUnicode_GET_LENGTH(self);
11238 kind = PyUnicode_KIND(self);
11239 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 if (length == 1)
11243 return PyBool_FromLong(
11244 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011246 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011247 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 for (i = 0; i < length; i++) {
11251 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011252 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011254 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255}
11256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011257PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011260Return True if all characters in S are digits\n\
11261and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262
11263static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011264unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266 Py_ssize_t i, length;
11267 int kind;
11268 void *data;
11269
11270 if (PyUnicode_READY(self) == -1)
11271 return NULL;
11272 length = PyUnicode_GET_LENGTH(self);
11273 kind = PyUnicode_KIND(self);
11274 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011277 if (length == 1) {
11278 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11279 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11280 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011282 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011284 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 for (i = 0; i < length; i++) {
11287 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011288 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011290 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291}
11292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011293PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011294 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011296Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011297False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298
11299static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011300unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 Py_ssize_t i, length;
11303 int kind;
11304 void *data;
11305
11306 if (PyUnicode_READY(self) == -1)
11307 return NULL;
11308 length = PyUnicode_GET_LENGTH(self);
11309 kind = PyUnicode_KIND(self);
11310 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 if (length == 1)
11314 return PyBool_FromLong(
11315 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011317 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011319 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 for (i = 0; i < length; i++) {
11322 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011323 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011325 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326}
11327
Martin v. Löwis47383402007-08-15 07:32:56 +000011328int
11329PyUnicode_IsIdentifier(PyObject *self)
11330{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 int kind;
11332 void *data;
11333 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011334 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 if (PyUnicode_READY(self) == -1) {
11337 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011338 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 }
11340
11341 /* Special case for empty strings */
11342 if (PyUnicode_GET_LENGTH(self) == 0)
11343 return 0;
11344 kind = PyUnicode_KIND(self);
11345 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011346
11347 /* PEP 3131 says that the first character must be in
11348 XID_Start and subsequent characters in XID_Continue,
11349 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011350 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011351 letters, digits, underscore). However, given the current
11352 definition of XID_Start and XID_Continue, it is sufficient
11353 to check just for these, except that _ must be allowed
11354 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011356 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011357 return 0;
11358
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011359 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011361 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011362 return 1;
11363}
11364
11365PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011367\n\
11368Return True if S is a valid identifier according\n\
11369to the language definition.");
11370
11371static PyObject*
11372unicode_isidentifier(PyObject *self)
11373{
11374 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11375}
11376
Georg Brandl559e5d72008-06-11 18:37:52 +000011377PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011378 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011379\n\
11380Return True if all characters in S are considered\n\
11381printable in repr() or S is empty, False otherwise.");
11382
11383static PyObject*
11384unicode_isprintable(PyObject *self)
11385{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 Py_ssize_t i, length;
11387 int kind;
11388 void *data;
11389
11390 if (PyUnicode_READY(self) == -1)
11391 return NULL;
11392 length = PyUnicode_GET_LENGTH(self);
11393 kind = PyUnicode_KIND(self);
11394 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011395
11396 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 if (length == 1)
11398 return PyBool_FromLong(
11399 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401 for (i = 0; i < length; i++) {
11402 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011403 Py_RETURN_FALSE;
11404 }
11405 }
11406 Py_RETURN_TRUE;
11407}
11408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011409PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011410 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411\n\
11412Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011413iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414
11415static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011416unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011418 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419}
11420
Martin v. Löwis18e16552006-02-15 17:27:45 +000011421static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011422unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 if (PyUnicode_READY(self) == -1)
11425 return -1;
11426 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427}
11428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011429PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011430 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011432Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011433done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434
11435static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011436unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011438 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 Py_UCS4 fillchar = ' ';
11440
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011441 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442 return NULL;
11443
Benjamin Petersonbac79492012-01-14 13:34:47 -050011444 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011445 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446
Victor Stinnerc4b49542011-12-11 22:44:26 +010011447 if (PyUnicode_GET_LENGTH(self) >= width)
11448 return unicode_result_unchanged(self);
11449
11450 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451}
11452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011453PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011454 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011456Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457
11458static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011459unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011461 if (PyUnicode_READY(self) == -1)
11462 return NULL;
11463 if (PyUnicode_IS_ASCII(self))
11464 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011465 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466}
11467
/* Strip modes.  The numeric values are used as indexes into
   stripformat[] below, so they must stay 0, 1, 2 in this order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode.  Both the
   characters and the pointers are const: the table is never meant to be
   modified at run time. */
static const char *const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i, for use in error messages: the format
   string with its three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11476
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011477/* externally visible for str.strip(unicode) */
11478PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011479_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 void *data;
11482 int kind;
11483 Py_ssize_t i, j, len;
11484 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11487 return NULL;
11488
11489 kind = PyUnicode_KIND(self);
11490 data = PyUnicode_DATA(self);
11491 len = PyUnicode_GET_LENGTH(self);
11492 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11493 PyUnicode_DATA(sepobj),
11494 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011495
Benjamin Peterson14339b62009-01-31 16:36:08 +000011496 i = 0;
11497 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 while (i < len &&
11499 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011500 i++;
11501 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011502 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011503
Benjamin Peterson14339b62009-01-31 16:36:08 +000011504 j = len;
11505 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 do {
11507 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 } while (j >= i &&
11509 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011511 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011512
Victor Stinner7931d9a2011-11-04 00:22:48 +010011513 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514}
11515
11516PyObject*
11517PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11518{
11519 unsigned char *data;
11520 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011521 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522
Victor Stinnerde636f32011-10-01 03:55:54 +020011523 if (PyUnicode_READY(self) == -1)
11524 return NULL;
11525
Victor Stinner684d5fd2012-05-03 02:32:34 +020011526 length = PyUnicode_GET_LENGTH(self);
11527 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011528
Victor Stinner684d5fd2012-05-03 02:32:34 +020011529 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011530 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531
Victor Stinnerde636f32011-10-01 03:55:54 +020011532 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011533 PyErr_SetString(PyExc_IndexError, "string index out of range");
11534 return NULL;
11535 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011536 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011537 Py_INCREF(unicode_empty);
11538 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011539 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011540
Victor Stinner684d5fd2012-05-03 02:32:34 +020011541 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011542 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011543 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011544 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011545 }
11546 else {
11547 kind = PyUnicode_KIND(self);
11548 data = PyUnicode_1BYTE_DATA(self);
11549 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011550 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011551 length);
11552 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554
11555static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011556do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 int kind;
11559 void *data;
11560 Py_ssize_t len, i, j;
11561
11562 if (PyUnicode_READY(self) == -1)
11563 return NULL;
11564
11565 kind = PyUnicode_KIND(self);
11566 data = PyUnicode_DATA(self);
11567 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011568
Benjamin Peterson14339b62009-01-31 16:36:08 +000011569 i = 0;
11570 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011572 i++;
11573 }
11574 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011575
Benjamin Peterson14339b62009-01-31 16:36:08 +000011576 j = len;
11577 if (striptype != LEFTSTRIP) {
11578 do {
11579 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011581 j++;
11582 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011583
Victor Stinner7931d9a2011-11-04 00:22:48 +010011584 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585}
11586
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011587
11588static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011589do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011590{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011591 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011592
Benjamin Peterson14339b62009-01-31 16:36:08 +000011593 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11594 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011595
Benjamin Peterson14339b62009-01-31 16:36:08 +000011596 if (sep != NULL && sep != Py_None) {
11597 if (PyUnicode_Check(sep))
11598 return _PyUnicode_XStrip(self, striptype, sep);
11599 else {
11600 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 "%s arg must be None or str",
11602 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011603 return NULL;
11604 }
11605 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011606
Benjamin Peterson14339b62009-01-31 16:36:08 +000011607 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011608}
11609
11610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011611PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011613\n\
11614Return a copy of the string S with leading and trailing\n\
11615whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011616If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011617
11618static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011619unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011620{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011621 if (PyTuple_GET_SIZE(args) == 0)
11622 return do_strip(self, BOTHSTRIP); /* Common case */
11623 else
11624 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011625}
11626
11627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011628PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011630\n\
11631Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011632If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011633
11634static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011635unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011636{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011637 if (PyTuple_GET_SIZE(args) == 0)
11638 return do_strip(self, LEFTSTRIP); /* Common case */
11639 else
11640 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011641}
11642
11643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011644PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011645 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011646\n\
11647Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011648If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011649
11650static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011651unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011652{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011653 if (PyTuple_GET_SIZE(args) == 0)
11654 return do_strip(self, RIGHTSTRIP); /* Common case */
11655 else
11656 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011657}
11658
11659
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011661unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011663 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665
Georg Brandl222de0f2009-04-12 12:01:50 +000011666 if (len < 1) {
11667 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011668 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011669 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670
Victor Stinnerc4b49542011-12-11 22:44:26 +010011671 /* no repeat, return original string */
11672 if (len == 1)
11673 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011674
Benjamin Petersonbac79492012-01-14 13:34:47 -050011675 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 return NULL;
11677
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011678 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011679 PyErr_SetString(PyExc_OverflowError,
11680 "repeated string is too long");
11681 return NULL;
11682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011684
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011685 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686 if (!u)
11687 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011688 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 if (PyUnicode_GET_LENGTH(str) == 1) {
11691 const int kind = PyUnicode_KIND(str);
11692 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011693 if (kind == PyUnicode_1BYTE_KIND) {
11694 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011695 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011696 }
11697 else if (kind == PyUnicode_2BYTE_KIND) {
11698 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011699 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011700 ucs2[n] = fill_char;
11701 } else {
11702 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11703 assert(kind == PyUnicode_4BYTE_KIND);
11704 for (n = 0; n < len; ++n)
11705 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 }
11708 else {
11709 /* number of characters copied this far */
11710 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011711 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 char *to = (char *) PyUnicode_DATA(u);
11713 Py_MEMCPY(to, PyUnicode_DATA(str),
11714 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 n = (done <= nchars-done) ? done : nchars-done;
11717 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011718 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720 }
11721
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011722 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011723 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724}
11725
Alexander Belopolsky40018472011-02-26 01:02:56 +000011726PyObject *
11727PyUnicode_Replace(PyObject *obj,
11728 PyObject *subobj,
11729 PyObject *replobj,
11730 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731{
11732 PyObject *self;
11733 PyObject *str1;
11734 PyObject *str2;
11735 PyObject *result;
11736
11737 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011738 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011741 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 Py_DECREF(self);
11743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744 }
11745 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011746 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 Py_DECREF(self);
11748 Py_DECREF(str1);
11749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011751 if (PyUnicode_READY(self) == -1 ||
11752 PyUnicode_READY(str1) == -1 ||
11753 PyUnicode_READY(str2) == -1)
11754 result = NULL;
11755 else
11756 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757 Py_DECREF(self);
11758 Py_DECREF(str1);
11759 Py_DECREF(str2);
11760 return result;
11761}
11762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011763PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011764 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765\n\
11766Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011767old replaced by new. If the optional argument count is\n\
11768given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769
11770static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 PyObject *str1;
11774 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011775 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 PyObject *result;
11777
Martin v. Löwis18e16552006-02-15 17:27:45 +000011778 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011780 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011783 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 return NULL;
11785 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011786 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 Py_DECREF(str1);
11788 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011789 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011790 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11791 result = NULL;
11792 else
11793 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794
11795 Py_DECREF(str1);
11796 Py_DECREF(str2);
11797 return result;
11798}
11799
Alexander Belopolsky40018472011-02-26 01:02:56 +000011800static PyObject *
11801unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011803 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 Py_ssize_t isize;
11805 Py_ssize_t osize, squote, dquote, i, o;
11806 Py_UCS4 max, quote;
11807 int ikind, okind;
11808 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011811 return NULL;
11812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 isize = PyUnicode_GET_LENGTH(unicode);
11814 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 /* Compute length of output, quote characters, and
11817 maximum character */
11818 osize = 2; /* quotes */
11819 max = 127;
11820 squote = dquote = 0;
11821 ikind = PyUnicode_KIND(unicode);
11822 for (i = 0; i < isize; i++) {
11823 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11824 switch (ch) {
11825 case '\'': squote++; osize++; break;
11826 case '"': dquote++; osize++; break;
11827 case '\\': case '\t': case '\r': case '\n':
11828 osize += 2; break;
11829 default:
11830 /* Fast-path ASCII */
11831 if (ch < ' ' || ch == 0x7f)
11832 osize += 4; /* \xHH */
11833 else if (ch < 0x7f)
11834 osize++;
11835 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11836 osize++;
11837 max = ch > max ? ch : max;
11838 }
11839 else if (ch < 0x100)
11840 osize += 4; /* \xHH */
11841 else if (ch < 0x10000)
11842 osize += 6; /* \uHHHH */
11843 else
11844 osize += 10; /* \uHHHHHHHH */
11845 }
11846 }
11847
11848 quote = '\'';
11849 if (squote) {
11850 if (dquote)
11851 /* Both squote and dquote present. Use squote,
11852 and escape them */
11853 osize += squote;
11854 else
11855 quote = '"';
11856 }
11857
11858 repr = PyUnicode_New(osize, max);
11859 if (repr == NULL)
11860 return NULL;
11861 okind = PyUnicode_KIND(repr);
11862 odata = PyUnicode_DATA(repr);
11863
11864 PyUnicode_WRITE(okind, odata, 0, quote);
11865 PyUnicode_WRITE(okind, odata, osize-1, quote);
11866
11867 for (i = 0, o = 1; i < isize; i++) {
11868 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011869
11870 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 if ((ch == quote) || (ch == '\\')) {
11872 PyUnicode_WRITE(okind, odata, o++, '\\');
11873 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011874 continue;
11875 }
11876
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011878 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 PyUnicode_WRITE(okind, odata, o++, '\\');
11880 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011881 }
11882 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 PyUnicode_WRITE(okind, odata, o++, '\\');
11884 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011885 }
11886 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 PyUnicode_WRITE(okind, odata, o++, '\\');
11888 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011889 }
11890
11891 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011892 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 PyUnicode_WRITE(okind, odata, o++, '\\');
11894 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011895 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11896 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011897 }
11898
Georg Brandl559e5d72008-06-11 18:37:52 +000011899 /* Copy ASCII characters as-is */
11900 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011902 }
11903
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011905 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011906 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011907 (categories Z* and C* except ASCII space)
11908 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011910 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000011911 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011914 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11915 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011916 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011917 /* Map 16-bit characters to '\uxxxx' */
11918 else if (ch <= 0xffff) {
11919 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011920 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11921 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11922 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11923 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011924 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011925 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011926 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011927 PyUnicode_WRITE(okind, odata, o++, 'U');
11928 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11929 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11930 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11931 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020011932 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11933 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11934 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11935 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011936 }
11937 }
11938 /* Copy characters as-is */
11939 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011941 }
11942 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011945 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011946 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947}
11948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011949PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951\n\
11952Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011953such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954arguments start and end are interpreted as in slice notation.\n\
11955\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011956Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957
11958static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011961 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011962 Py_ssize_t start;
11963 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011964 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965
Jesus Ceaac451502011-04-20 17:09:23 +020011966 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11967 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011968 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (PyUnicode_READY(self) == -1)
11971 return NULL;
11972 if (PyUnicode_READY(substring) == -1)
11973 return NULL;
11974
Victor Stinner7931d9a2011-11-04 00:22:48 +010011975 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976
11977 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 if (result == -2)
11980 return NULL;
11981
Christian Heimes217cfd12007-12-02 14:31:20 +000011982 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983}
11984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011985PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011988Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989
11990static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011993 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011994 Py_ssize_t start;
11995 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011996 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997
Jesus Ceaac451502011-04-20 17:09:23 +020011998 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11999 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012000 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 if (PyUnicode_READY(self) == -1)
12003 return NULL;
12004 if (PyUnicode_READY(substring) == -1)
12005 return NULL;
12006
Victor Stinner7931d9a2011-11-04 00:22:48 +010012007 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008
12009 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 if (result == -2)
12012 return NULL;
12013
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014 if (result < 0) {
12015 PyErr_SetString(PyExc_ValueError, "substring not found");
12016 return NULL;
12017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018
Christian Heimes217cfd12007-12-02 14:31:20 +000012019 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020}
12021
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012022PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012023 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012025Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012026done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027
12028static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012029unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012031 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032 Py_UCS4 fillchar = ' ';
12033
Victor Stinnere9a29352011-10-01 02:14:59 +020012034 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012036
Benjamin Petersonbac79492012-01-14 13:34:47 -050012037 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038 return NULL;
12039
Victor Stinnerc4b49542011-12-11 22:44:26 +010012040 if (PyUnicode_GET_LENGTH(self) >= width)
12041 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042
Victor Stinnerc4b49542011-12-11 22:44:26 +010012043 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044}
12045
Alexander Belopolsky40018472011-02-26 01:02:56 +000012046PyObject *
12047PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048{
12049 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012050
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051 s = PyUnicode_FromObject(s);
12052 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012053 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012054 if (sep != NULL) {
12055 sep = PyUnicode_FromObject(sep);
12056 if (sep == NULL) {
12057 Py_DECREF(s);
12058 return NULL;
12059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060 }
12061
Victor Stinner9310abb2011-10-05 00:59:23 +020012062 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063
12064 Py_DECREF(s);
12065 Py_XDECREF(sep);
12066 return result;
12067}
12068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012069PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012070 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071\n\
12072Return a list of the words in S, using sep as the\n\
12073delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012074splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012075whitespace string is a separator and empty strings are\n\
12076removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077
12078static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012079unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012081 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012083 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012085 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12086 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087 return NULL;
12088
12089 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012090 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012092 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012094 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095}
12096
Thomas Wouters477c8d52006-05-27 19:21:47 +000012097PyObject *
12098PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12099{
12100 PyObject* str_obj;
12101 PyObject* sep_obj;
12102 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 int kind1, kind2, kind;
12104 void *buf1 = NULL, *buf2 = NULL;
12105 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012106
12107 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012108 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012110 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012111 if (!sep_obj) {
12112 Py_DECREF(str_obj);
12113 return NULL;
12114 }
12115 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12116 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012117 Py_DECREF(str_obj);
12118 return NULL;
12119 }
12120
Victor Stinner14f8f022011-10-05 20:58:25 +020012121 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012123 kind = Py_MAX(kind1, kind2);
12124 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012126 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 if (!buf1)
12128 goto onError;
12129 buf2 = PyUnicode_DATA(sep_obj);
12130 if (kind2 != kind)
12131 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12132 if (!buf2)
12133 goto onError;
12134 len1 = PyUnicode_GET_LENGTH(str_obj);
12135 len2 = PyUnicode_GET_LENGTH(sep_obj);
12136
Benjamin Petersonead6b532011-12-20 17:23:42 -060012137 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012139 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12140 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12141 else
12142 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 break;
12144 case PyUnicode_2BYTE_KIND:
12145 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12146 break;
12147 case PyUnicode_4BYTE_KIND:
12148 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12149 break;
12150 default:
12151 assert(0);
12152 out = 0;
12153 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012154
12155 Py_DECREF(sep_obj);
12156 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157 if (kind1 != kind)
12158 PyMem_Free(buf1);
12159 if (kind2 != kind)
12160 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012161
12162 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 onError:
12164 Py_DECREF(sep_obj);
12165 Py_DECREF(str_obj);
12166 if (kind1 != kind && buf1)
12167 PyMem_Free(buf1);
12168 if (kind2 != kind && buf2)
12169 PyMem_Free(buf2);
12170 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012171}
12172
12173
12174PyObject *
12175PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12176{
12177 PyObject* str_obj;
12178 PyObject* sep_obj;
12179 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180 int kind1, kind2, kind;
12181 void *buf1 = NULL, *buf2 = NULL;
12182 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012183
12184 str_obj = PyUnicode_FromObject(str_in);
12185 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012186 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012187 sep_obj = PyUnicode_FromObject(sep_in);
12188 if (!sep_obj) {
12189 Py_DECREF(str_obj);
12190 return NULL;
12191 }
12192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 kind1 = PyUnicode_KIND(str_in);
12194 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012195 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 buf1 = PyUnicode_DATA(str_in);
12197 if (kind1 != kind)
12198 buf1 = _PyUnicode_AsKind(str_in, kind);
12199 if (!buf1)
12200 goto onError;
12201 buf2 = PyUnicode_DATA(sep_obj);
12202 if (kind2 != kind)
12203 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12204 if (!buf2)
12205 goto onError;
12206 len1 = PyUnicode_GET_LENGTH(str_obj);
12207 len2 = PyUnicode_GET_LENGTH(sep_obj);
12208
Benjamin Petersonead6b532011-12-20 17:23:42 -060012209 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012211 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12212 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12213 else
12214 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 break;
12216 case PyUnicode_2BYTE_KIND:
12217 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12218 break;
12219 case PyUnicode_4BYTE_KIND:
12220 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12221 break;
12222 default:
12223 assert(0);
12224 out = 0;
12225 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012226
12227 Py_DECREF(sep_obj);
12228 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 if (kind1 != kind)
12230 PyMem_Free(buf1);
12231 if (kind2 != kind)
12232 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012233
12234 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235 onError:
12236 Py_DECREF(sep_obj);
12237 Py_DECREF(str_obj);
12238 if (kind1 != kind && buf1)
12239 PyMem_Free(buf1);
12240 if (kind2 != kind && buf2)
12241 PyMem_Free(buf2);
12242 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012243}
12244
12245PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012247\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012248Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012249the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012250found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012251
12252static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012253unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012254{
Victor Stinner9310abb2011-10-05 00:59:23 +020012255 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012256}
12257
12258PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012259 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012260\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012261Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012262the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012263separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012264
12265static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012266unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012267{
Victor Stinner9310abb2011-10-05 00:59:23 +020012268 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012269}
12270
Alexander Belopolsky40018472011-02-26 01:02:56 +000012271PyObject *
12272PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012273{
12274 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012275
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012276 s = PyUnicode_FromObject(s);
12277 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012278 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 if (sep != NULL) {
12280 sep = PyUnicode_FromObject(sep);
12281 if (sep == NULL) {
12282 Py_DECREF(s);
12283 return NULL;
12284 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012285 }
12286
Victor Stinner9310abb2011-10-05 00:59:23 +020012287 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012288
12289 Py_DECREF(s);
12290 Py_XDECREF(sep);
12291 return result;
12292}
12293
12294PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012295 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012296\n\
12297Return a list of the words in S, using sep as the\n\
12298delimiter string, starting at the end of the string and\n\
12299working to the front. If maxsplit is given, at most maxsplit\n\
12300splits are done. If sep is not specified, any whitespace string\n\
12301is a separator.");
12302
12303static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012304unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012305{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012306 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012307 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012308 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012309
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012310 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12311 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012312 return NULL;
12313
12314 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012315 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012316 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012317 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012318 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012319 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012320}
12321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012322PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324\n\
12325Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012326Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012327is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328
12329static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012330unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012332 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012333 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012335 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12336 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012337 return NULL;
12338
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012339 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340}
12341
12342static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012343PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012344{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012345 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346}
12347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012348PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012349 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350\n\
12351Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012352and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353
12354static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012355unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012357 if (PyUnicode_READY(self) == -1)
12358 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012359 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360}
12361
Georg Brandlceee0772007-11-27 23:48:05 +000012362PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012363 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012364\n\
12365Return a translation table usable for str.translate().\n\
12366If there is only one argument, it must be a dictionary mapping Unicode\n\
12367ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012368Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012369If there are two arguments, they must be strings of equal length, and\n\
12370in the resulting dictionary, each character in x will be mapped to the\n\
12371character at the same position in y. If there is a third argument, it\n\
12372must be a string, whose characters will be mapped to None in the result.");
12373
12374static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012375unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012376{
12377 PyObject *x, *y = NULL, *z = NULL;
12378 PyObject *new = NULL, *key, *value;
12379 Py_ssize_t i = 0;
12380 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012381
Georg Brandlceee0772007-11-27 23:48:05 +000012382 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12383 return NULL;
12384 new = PyDict_New();
12385 if (!new)
12386 return NULL;
12387 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 int x_kind, y_kind, z_kind;
12389 void *x_data, *y_data, *z_data;
12390
Georg Brandlceee0772007-11-27 23:48:05 +000012391 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012392 if (!PyUnicode_Check(x)) {
12393 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12394 "be a string if there is a second argument");
12395 goto err;
12396 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012398 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12399 "arguments must have equal length");
12400 goto err;
12401 }
12402 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403 x_kind = PyUnicode_KIND(x);
12404 y_kind = PyUnicode_KIND(y);
12405 x_data = PyUnicode_DATA(x);
12406 y_data = PyUnicode_DATA(y);
12407 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12408 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012409 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012410 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012411 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012412 if (!value) {
12413 Py_DECREF(key);
12414 goto err;
12415 }
Georg Brandlceee0772007-11-27 23:48:05 +000012416 res = PyDict_SetItem(new, key, value);
12417 Py_DECREF(key);
12418 Py_DECREF(value);
12419 if (res < 0)
12420 goto err;
12421 }
12422 /* create entries for deleting chars in z */
12423 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 z_kind = PyUnicode_KIND(z);
12425 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012426 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012428 if (!key)
12429 goto err;
12430 res = PyDict_SetItem(new, key, Py_None);
12431 Py_DECREF(key);
12432 if (res < 0)
12433 goto err;
12434 }
12435 }
12436 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 int kind;
12438 void *data;
12439
Georg Brandlceee0772007-11-27 23:48:05 +000012440 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012441 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012442 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12443 "to maketrans it must be a dict");
12444 goto err;
12445 }
12446 /* copy entries into the new dict, converting string keys to int keys */
12447 while (PyDict_Next(x, &i, &key, &value)) {
12448 if (PyUnicode_Check(key)) {
12449 /* convert string keys to integer keys */
12450 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012451 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012452 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12453 "table must be of length 1");
12454 goto err;
12455 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 kind = PyUnicode_KIND(key);
12457 data = PyUnicode_DATA(key);
12458 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012459 if (!newkey)
12460 goto err;
12461 res = PyDict_SetItem(new, newkey, value);
12462 Py_DECREF(newkey);
12463 if (res < 0)
12464 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012465 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012466 /* just keep integer keys */
12467 if (PyDict_SetItem(new, key, value) < 0)
12468 goto err;
12469 } else {
12470 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12471 "be strings or integers");
12472 goto err;
12473 }
12474 }
12475 }
12476 return new;
12477 err:
12478 Py_DECREF(new);
12479 return NULL;
12480}
12481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012482PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012483 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484\n\
12485Return a copy of the string S, where all characters have been mapped\n\
12486through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012487Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012488Unmapped characters are left untouched. Characters mapped to None\n\
12489are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
12491static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012495}
12496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012497PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012498 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012500Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501
12502static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012503unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012505 if (PyUnicode_READY(self) == -1)
12506 return NULL;
12507 if (PyUnicode_IS_ASCII(self))
12508 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012509 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510}
12511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012512PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012513 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012515Pad a numeric string S with zeros on the left, to fill a field\n\
12516of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517
12518static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012519unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012521 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012522 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012523 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 int kind;
12525 void *data;
12526 Py_UCS4 chr;
12527
Martin v. Löwis18e16552006-02-15 17:27:45 +000012528 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529 return NULL;
12530
Benjamin Petersonbac79492012-01-14 13:34:47 -050012531 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012532 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533
Victor Stinnerc4b49542011-12-11 22:44:26 +010012534 if (PyUnicode_GET_LENGTH(self) >= width)
12535 return unicode_result_unchanged(self);
12536
12537 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538
12539 u = pad(self, fill, 0, '0');
12540
Walter Dörwald068325e2002-04-15 13:36:47 +000012541 if (u == NULL)
12542 return NULL;
12543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 kind = PyUnicode_KIND(u);
12545 data = PyUnicode_DATA(u);
12546 chr = PyUnicode_READ(kind, data, fill);
12547
12548 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 PyUnicode_WRITE(kind, data, 0, chr);
12551 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552 }
12553
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012554 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012555 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557
#if 0
/* Compiled out by the enclosing #if 0.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012566PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012567 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012569Return True if S starts with the specified prefix, False otherwise.\n\
12570With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012571With optional end, stop comparing S at that position.\n\
12572prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573
12574static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012575unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012578 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012579 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012580 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012581 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012582 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583
Jesus Ceaac451502011-04-20 17:09:23 +020012584 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012585 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012586 if (PyTuple_Check(subobj)) {
12587 Py_ssize_t i;
12588 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012589 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012590 if (substring == NULL)
12591 return NULL;
12592 result = tailmatch(self, substring, start, end, -1);
12593 Py_DECREF(substring);
12594 if (result) {
12595 Py_RETURN_TRUE;
12596 }
12597 }
12598 /* nothing matched */
12599 Py_RETURN_FALSE;
12600 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012601 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012602 if (substring == NULL) {
12603 if (PyErr_ExceptionMatches(PyExc_TypeError))
12604 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12605 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012606 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012607 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012608 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012610 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611}
12612
12613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012614PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012615 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012617Return True if S ends with the specified suffix, False otherwise.\n\
12618With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012619With optional end, stop comparing S at that position.\n\
12620suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621
12622static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012623unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012624 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012626 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012627 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012628 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012629 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012630 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631
Jesus Ceaac451502011-04-20 17:09:23 +020012632 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012633 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012634 if (PyTuple_Check(subobj)) {
12635 Py_ssize_t i;
12636 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012637 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012638 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012639 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012641 result = tailmatch(self, substring, start, end, +1);
12642 Py_DECREF(substring);
12643 if (result) {
12644 Py_RETURN_TRUE;
12645 }
12646 }
12647 Py_RETURN_FALSE;
12648 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012649 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012650 if (substring == NULL) {
12651 if (PyErr_ExceptionMatches(PyExc_TypeError))
12652 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12653 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012654 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012655 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012656 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012658 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659}
12660
Victor Stinner202fdca2012-05-07 12:47:02 +020012661Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012662_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012663{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012664 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012665 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12666 writer->data = PyUnicode_DATA(writer->buffer);
12667 writer->kind = PyUnicode_KIND(writer->buffer);
12668}
12669
Victor Stinnerd3f08822012-05-29 12:57:52 +020012670void
12671_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012672{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012673 memset(writer, 0, sizeof(*writer));
12674#ifdef Py_DEBUG
12675 writer->kind = 5; /* invalid kind */
12676#endif
12677 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012678 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012679}
12680
Victor Stinnerd3f08822012-05-29 12:57:52 +020012681int
12682_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12683 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012684{
12685 Py_ssize_t newlen;
12686 PyObject *newbuffer;
12687
Victor Stinnerd3f08822012-05-29 12:57:52 +020012688 assert(length > 0);
12689
Victor Stinner202fdca2012-05-07 12:47:02 +020012690 if (length > PY_SSIZE_T_MAX - writer->pos) {
12691 PyErr_NoMemory();
12692 return -1;
12693 }
12694 newlen = writer->pos + length;
12695
Victor Stinnerd3f08822012-05-29 12:57:52 +020012696 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012697 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012698 /* overallocate 25% to limit the number of resize */
12699 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12700 newlen += newlen / 4;
12701 if (newlen < writer->min_length)
12702 newlen = writer->min_length;
12703 }
12704 writer->buffer = PyUnicode_New(newlen, maxchar);
12705 if (writer->buffer == NULL)
12706 return -1;
12707 _PyUnicodeWriter_Update(writer);
12708 return 0;
12709 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012710
Victor Stinnerd3f08822012-05-29 12:57:52 +020012711 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012712 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012713 /* overallocate 25% to limit the number of resize */
12714 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12715 newlen += newlen / 4;
12716 if (newlen < writer->min_length)
12717 newlen = writer->min_length;
12718 }
12719
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012720 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012721 /* resize + widen */
12722 newbuffer = PyUnicode_New(newlen, maxchar);
12723 if (newbuffer == NULL)
12724 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012725 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12726 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012727 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012728 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012729 }
12730 else {
12731 newbuffer = resize_compact(writer->buffer, newlen);
12732 if (newbuffer == NULL)
12733 return -1;
12734 }
12735 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012736 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012737 }
12738 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012739 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012740 newbuffer = PyUnicode_New(writer->size, maxchar);
12741 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012742 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012743 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12744 writer->buffer, 0, writer->pos);
12745 Py_DECREF(writer->buffer);
12746 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012747 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012748 }
12749 return 0;
12750}
12751
Victor Stinnerd3f08822012-05-29 12:57:52 +020012752int
12753_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12754{
12755 Py_UCS4 maxchar;
12756 Py_ssize_t len;
12757
12758 if (PyUnicode_READY(str) == -1)
12759 return -1;
12760 len = PyUnicode_GET_LENGTH(str);
12761 if (len == 0)
12762 return 0;
12763 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12764 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012765 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012766 Py_INCREF(str);
12767 writer->buffer = str;
12768 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012769 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012770 writer->size = 0;
12771 writer->pos += len;
12772 return 0;
12773 }
12774 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12775 return -1;
12776 }
12777 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12778 str, 0, len);
12779 writer->pos += len;
12780 return 0;
12781}
12782
Victor Stinnere215d962012-10-06 23:03:36 +020012783int
12784_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12785{
12786 Py_UCS4 maxchar;
12787
12788 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12789 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12790 return -1;
12791 unicode_write_cstr(writer->buffer, writer->pos, str, len);
12792 writer->pos += len;
12793 return 0;
12794}
12795
Victor Stinnerd3f08822012-05-29 12:57:52 +020012796PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012797_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012798{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012799 if (writer->pos == 0) {
12800 Py_XDECREF(writer->buffer);
12801 Py_INCREF(unicode_empty);
12802 return unicode_empty;
12803 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012804 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012805 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12806 return writer->buffer;
12807 }
12808 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12809 PyObject *newbuffer;
12810 newbuffer = resize_compact(writer->buffer, writer->pos);
12811 if (newbuffer == NULL) {
12812 Py_DECREF(writer->buffer);
12813 return NULL;
12814 }
12815 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012816 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012817 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012818 return writer->buffer;
12819}
12820
Victor Stinnerd3f08822012-05-29 12:57:52 +020012821void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012822_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012823{
12824 Py_CLEAR(writer->buffer);
12825}
12826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012828
12829PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012830 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012831\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012832Return a formatted version of S, using substitutions from args and kwargs.\n\
12833The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012834
Eric Smith27bbca62010-11-04 17:06:58 +000012835PyDoc_STRVAR(format_map__doc__,
12836 "S.format_map(mapping) -> str\n\
12837\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012838Return a formatted version of S, using substitutions from mapping.\n\
12839The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012840
Eric Smith4a7d76d2008-05-30 18:10:19 +000012841static PyObject *
12842unicode__format__(PyObject* self, PyObject* args)
12843{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012844 PyObject *format_spec;
12845 _PyUnicodeWriter writer;
12846 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012847
12848 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12849 return NULL;
12850
Victor Stinnerd3f08822012-05-29 12:57:52 +020012851 if (PyUnicode_READY(self) == -1)
12852 return NULL;
12853 _PyUnicodeWriter_Init(&writer, 0);
12854 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12855 self, format_spec, 0,
12856 PyUnicode_GET_LENGTH(format_spec));
12857 if (ret == -1) {
12858 _PyUnicodeWriter_Dealloc(&writer);
12859 return NULL;
12860 }
12861 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012862}
12863
Eric Smith8c663262007-08-25 02:26:07 +000012864PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012865 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012866\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012867Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012868
12869static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012870unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012871{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872 Py_ssize_t size;
12873
12874 /* If it's a compact object, account for base structure +
12875 character data. */
12876 if (PyUnicode_IS_COMPACT_ASCII(v))
12877 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12878 else if (PyUnicode_IS_COMPACT(v))
12879 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012880 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881 else {
12882 /* If it is a two-block object, account for base object, and
12883 for character block if present. */
12884 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012885 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012886 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012887 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012888 }
12889 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012890 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012891 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012893 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012894 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012895
12896 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012897}
12898
12899PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012900 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012901
12902static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012903unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012904{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012905 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012906 if (!copy)
12907 return NULL;
12908 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012909}
12910
Guido van Rossumd57fd912000-03-10 22:53:23 +000012911static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012912 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012913 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012914 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
12915 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012916 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12917 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050012918 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012919 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12920 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12921 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12922 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12923 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012924 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012925 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12926 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12927 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012928 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012929 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12930 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12931 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012932 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012933 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012934 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012935 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012936 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12937 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12938 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12939 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12940 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12941 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12942 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12943 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12944 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12945 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12946 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12947 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12948 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12949 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012950 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012951 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012952 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012953 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012954 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012955 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012956 {"maketrans", (PyCFunction) unicode_maketrans,
12957 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012958 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012959#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012960 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012961 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012962#endif
12963
Benjamin Peterson14339b62009-01-31 16:36:08 +000012964 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965 {NULL, NULL}
12966};
12967
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012968static PyObject *
12969unicode_mod(PyObject *v, PyObject *w)
12970{
Brian Curtindfc80e32011-08-10 20:28:54 -050012971 if (!PyUnicode_Check(v))
12972 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012973 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012974}
12975
12976static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012977 0, /*nb_add*/
12978 0, /*nb_subtract*/
12979 0, /*nb_multiply*/
12980 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012981};
12982
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012984 (lenfunc) unicode_length, /* sq_length */
12985 PyUnicode_Concat, /* sq_concat */
12986 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12987 (ssizeargfunc) unicode_getitem, /* sq_item */
12988 0, /* sq_slice */
12989 0, /* sq_ass_item */
12990 0, /* sq_ass_slice */
12991 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012992};
12993
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012994static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012995unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 if (PyUnicode_READY(self) == -1)
12998 return NULL;
12999
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013000 if (PyIndex_Check(item)) {
13001 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013002 if (i == -1 && PyErr_Occurred())
13003 return NULL;
13004 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013006 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013007 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013008 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013009 PyObject *result;
13010 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013011 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013012 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013015 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013016 return NULL;
13017 }
13018
13019 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013020 Py_INCREF(unicode_empty);
13021 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013022 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013023 slicelength == PyUnicode_GET_LENGTH(self)) {
13024 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013025 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013026 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013027 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013028 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013029 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013030 src_kind = PyUnicode_KIND(self);
13031 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013032 if (!PyUnicode_IS_ASCII(self)) {
13033 kind_limit = kind_maxchar_limit(src_kind);
13034 max_char = 0;
13035 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13036 ch = PyUnicode_READ(src_kind, src_data, cur);
13037 if (ch > max_char) {
13038 max_char = ch;
13039 if (max_char >= kind_limit)
13040 break;
13041 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013042 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013043 }
Victor Stinner55c99112011-10-13 01:17:06 +020013044 else
13045 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013046 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013047 if (result == NULL)
13048 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013049 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013050 dest_data = PyUnicode_DATA(result);
13051
13052 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013053 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13054 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013055 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013056 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013057 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013058 } else {
13059 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13060 return NULL;
13061 }
13062}
13063
13064static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013065 (lenfunc)unicode_length, /* mp_length */
13066 (binaryfunc)unicode_subscript, /* mp_subscript */
13067 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013068};
13069
Guido van Rossumd57fd912000-03-10 22:53:23 +000013070
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071/* Helpers for PyUnicode_Format() */
13072
Victor Stinnera47082312012-10-04 02:19:54 +020013073struct unicode_formatter_t {
13074 PyObject *args;
13075 int args_owned;
13076 Py_ssize_t arglen, argidx;
13077 PyObject *dict;
13078
13079 enum PyUnicode_Kind fmtkind;
13080 Py_ssize_t fmtcnt, fmtpos;
13081 void *fmtdata;
13082 PyObject *fmtstr;
13083
13084 _PyUnicodeWriter writer;
13085};
13086
13087struct unicode_format_arg_t {
13088 Py_UCS4 ch;
13089 int flags;
13090 Py_ssize_t width;
13091 int prec;
13092 int sign;
13093};
13094
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013096unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097{
Victor Stinnera47082312012-10-04 02:19:54 +020013098 Py_ssize_t argidx = ctx->argidx;
13099
13100 if (argidx < ctx->arglen) {
13101 ctx->argidx++;
13102 if (ctx->arglen < 0)
13103 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013104 else
Victor Stinnera47082312012-10-04 02:19:54 +020013105 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106 }
13107 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013108 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109 return NULL;
13110}
13111
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013112/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113
Victor Stinnera47082312012-10-04 02:19:54 +020013114/* Format a float into the writer if the writer is not NULL, or into *p_output
13115 otherwise.
13116
13117 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013118static int
Victor Stinnera47082312012-10-04 02:19:54 +020013119formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13120 PyObject **p_output,
13121 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013123 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013125 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013126 int prec;
13127 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013128
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129 x = PyFloat_AsDouble(v);
13130 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013131 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013132
Victor Stinnera47082312012-10-04 02:19:54 +020013133 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013135 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013136
Victor Stinnera47082312012-10-04 02:19:54 +020013137 if (arg->flags & F_ALT)
13138 dtoa_flags = Py_DTSF_ALT;
13139 else
13140 dtoa_flags = 0;
13141 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013142 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013143 return -1;
13144 len = strlen(p);
13145 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013146 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13147 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013148 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013149 }
Victor Stinner184252a2012-06-16 02:57:41 +020013150 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013151 writer->pos += len;
13152 }
13153 else
13154 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013155 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013156 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157}
13158
Victor Stinnerd0880d52012-04-27 23:40:13 +020013159/* formatlong() emulates the format codes d, u, o, x and X, and
13160 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13161 * Python's regular ints.
13162 * Return value: a new PyUnicodeObject*, or NULL if error.
13163 * The output string is of the form
13164 * "-"? ("0x" | "0X")? digit+
13165 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13166 * set in flags. The case of hex digits will be correct,
13167 * There will be at least prec digits, zero-filled on the left if
13168 * necessary to get that many.
13169 * val object to be converted
13170 * flags bitmask of format flags; only F_ALT is looked at
13171 * prec minimum number of digits; 0-fill on left if needed
13172 * type a character in [duoxX]; u acts the same as d
13173 *
13174 * CAUTION: o, x and X conversions on regular ints can never
13175 * produce a '-' sign, but can for Python's unbounded ints.
13176 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013177static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013178formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013179{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013180 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013181 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013182 Py_ssize_t i;
13183 int sign; /* 1 if '-', else 0 */
13184 int len; /* number of characters */
13185 Py_ssize_t llen;
13186 int numdigits; /* len == numnondigits + numdigits */
13187 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013188 int prec = arg->prec;
13189 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013190
Victor Stinnerd0880d52012-04-27 23:40:13 +020013191 /* Avoid exceeding SSIZE_T_MAX */
13192 if (prec > INT_MAX-3) {
13193 PyErr_SetString(PyExc_OverflowError,
13194 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013195 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013196 }
13197
13198 assert(PyLong_Check(val));
13199
13200 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013201 default:
13202 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013203 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013204 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013205 case 'u':
13206 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013207 if (PyBool_Check(val))
13208 result = PyNumber_ToBase(val, 10);
13209 else
13210 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013211 break;
13212 case 'o':
13213 numnondigits = 2;
13214 result = PyNumber_ToBase(val, 8);
13215 break;
13216 case 'x':
13217 case 'X':
13218 numnondigits = 2;
13219 result = PyNumber_ToBase(val, 16);
13220 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013221 }
13222 if (!result)
13223 return NULL;
13224
13225 assert(unicode_modifiable(result));
13226 assert(PyUnicode_IS_READY(result));
13227 assert(PyUnicode_IS_ASCII(result));
13228
13229 /* To modify the string in-place, there can only be one reference. */
13230 if (Py_REFCNT(result) != 1) {
13231 PyErr_BadInternalCall();
13232 return NULL;
13233 }
13234 buf = PyUnicode_DATA(result);
13235 llen = PyUnicode_GET_LENGTH(result);
13236 if (llen > INT_MAX) {
13237 PyErr_SetString(PyExc_ValueError,
13238 "string too large in _PyBytes_FormatLong");
13239 return NULL;
13240 }
13241 len = (int)llen;
13242 sign = buf[0] == '-';
13243 numnondigits += sign;
13244 numdigits = len - numnondigits;
13245 assert(numdigits > 0);
13246
13247 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013248 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013249 (type == 'o' || type == 'x' || type == 'X'))) {
13250 assert(buf[sign] == '0');
13251 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13252 buf[sign+1] == 'o');
13253 numnondigits -= 2;
13254 buf += 2;
13255 len -= 2;
13256 if (sign)
13257 buf[0] = '-';
13258 assert(len == numnondigits + numdigits);
13259 assert(numdigits > 0);
13260 }
13261
13262 /* Fill with leading zeroes to meet minimum width. */
13263 if (prec > numdigits) {
13264 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13265 numnondigits + prec);
13266 char *b1;
13267 if (!r1) {
13268 Py_DECREF(result);
13269 return NULL;
13270 }
13271 b1 = PyBytes_AS_STRING(r1);
13272 for (i = 0; i < numnondigits; ++i)
13273 *b1++ = *buf++;
13274 for (i = 0; i < prec - numdigits; i++)
13275 *b1++ = '0';
13276 for (i = 0; i < numdigits; i++)
13277 *b1++ = *buf++;
13278 *b1 = '\0';
13279 Py_DECREF(result);
13280 result = r1;
13281 buf = PyBytes_AS_STRING(result);
13282 len = numnondigits + prec;
13283 }
13284
13285 /* Fix up case for hex conversions. */
13286 if (type == 'X') {
13287 /* Need to convert all lower case letters to upper case.
13288 and need to convert 0x to 0X (and -0x to -0X). */
13289 for (i = 0; i < len; i++)
13290 if (buf[i] >= 'a' && buf[i] <= 'x')
13291 buf[i] -= 'a'-'A';
13292 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013293 if (!PyUnicode_Check(result)
13294 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013295 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013296 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013297 Py_DECREF(result);
13298 result = unicode;
13299 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013300 else if (len != PyUnicode_GET_LENGTH(result)) {
13301 if (PyUnicode_Resize(&result, len) < 0)
13302 Py_CLEAR(result);
13303 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013304 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013305}
13306
Victor Stinner621ef3d2012-10-02 00:33:47 +020013307/* Format an integer.
13308 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013309 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013310 * -1 and raise an exception on error */
13311static int
Victor Stinnera47082312012-10-04 02:19:54 +020013312mainformatlong(PyObject *v,
13313 struct unicode_format_arg_t *arg,
13314 PyObject **p_output,
13315 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013316{
13317 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013318 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013319
13320 if (!PyNumber_Check(v))
13321 goto wrongtype;
13322
13323 if (!PyLong_Check(v)) {
13324 iobj = PyNumber_Long(v);
13325 if (iobj == NULL) {
13326 if (PyErr_ExceptionMatches(PyExc_TypeError))
13327 goto wrongtype;
13328 return -1;
13329 }
13330 assert(PyLong_Check(iobj));
13331 }
13332 else {
13333 iobj = v;
13334 Py_INCREF(iobj);
13335 }
13336
13337 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013338 && arg->width == -1 && arg->prec == -1
13339 && !(arg->flags & (F_SIGN | F_BLANK))
13340 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013341 {
13342 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013343 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013344 int base;
13345
Victor Stinnera47082312012-10-04 02:19:54 +020013346 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013347 {
13348 default:
13349 assert(0 && "'type' not in [diuoxX]");
13350 case 'd':
13351 case 'i':
13352 case 'u':
13353 base = 10;
13354 break;
13355 case 'o':
13356 base = 8;
13357 break;
13358 case 'x':
13359 case 'X':
13360 base = 16;
13361 break;
13362 }
13363
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013364 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13365 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013366 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013367 }
13368 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013369 return 1;
13370 }
13371
Victor Stinnera47082312012-10-04 02:19:54 +020013372 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013373 Py_DECREF(iobj);
13374 if (res == NULL)
13375 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013376 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013377 return 0;
13378
13379wrongtype:
13380 PyErr_Format(PyExc_TypeError,
13381 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013382 "not %.200s",
13383 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013384 return -1;
13385}
13386
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013387static Py_UCS4
13388formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013389{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013390 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013391 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013392 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013393 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013394 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 goto onError;
13396 }
13397 else {
13398 /* Integer input truncated to a character */
13399 long x;
13400 x = PyLong_AsLong(v);
13401 if (x == -1 && PyErr_Occurred())
13402 goto onError;
13403
Victor Stinner8faf8212011-12-08 22:14:11 +010013404 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013405 PyErr_SetString(PyExc_OverflowError,
13406 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013407 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 }
13409
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013410 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013411 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013412
Benjamin Peterson29060642009-01-31 22:14:21 +000013413 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013414 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013415 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013416 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013417}
13418
Victor Stinnera47082312012-10-04 02:19:54 +020013419/* Parse options of an argument: flags, width, precision.
13420 Handle also "%(name)" syntax.
13421
13422 Return 0 if the argument has been formatted into arg->str.
13423 Return 1 if the argument has been written into ctx->writer,
13424 Raise an exception and return -1 on error. */
13425static int
13426unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13427 struct unicode_format_arg_t *arg)
13428{
13429#define FORMAT_READ(ctx) \
13430 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13431
13432 PyObject *v;
13433
13434 arg->ch = FORMAT_READ(ctx);
13435 if (arg->ch == '(') {
13436 /* Get argument value from a dictionary. Example: "%(name)s". */
13437 Py_ssize_t keystart;
13438 Py_ssize_t keylen;
13439 PyObject *key;
13440 int pcount = 1;
13441
13442 if (ctx->dict == NULL) {
13443 PyErr_SetString(PyExc_TypeError,
13444 "format requires a mapping");
13445 return -1;
13446 }
13447 ++ctx->fmtpos;
13448 --ctx->fmtcnt;
13449 keystart = ctx->fmtpos;
13450 /* Skip over balanced parentheses */
13451 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13452 arg->ch = FORMAT_READ(ctx);
13453 if (arg->ch == ')')
13454 --pcount;
13455 else if (arg->ch == '(')
13456 ++pcount;
13457 ctx->fmtpos++;
13458 }
13459 keylen = ctx->fmtpos - keystart - 1;
13460 if (ctx->fmtcnt < 0 || pcount > 0) {
13461 PyErr_SetString(PyExc_ValueError,
13462 "incomplete format key");
13463 return -1;
13464 }
13465 key = PyUnicode_Substring(ctx->fmtstr,
13466 keystart, keystart + keylen);
13467 if (key == NULL)
13468 return -1;
13469 if (ctx->args_owned) {
13470 Py_DECREF(ctx->args);
13471 ctx->args_owned = 0;
13472 }
13473 ctx->args = PyObject_GetItem(ctx->dict, key);
13474 Py_DECREF(key);
13475 if (ctx->args == NULL)
13476 return -1;
13477 ctx->args_owned = 1;
13478 ctx->arglen = -1;
13479 ctx->argidx = -2;
13480 }
13481
13482 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
13483 arg->flags = 0;
13484 while (--ctx->fmtcnt >= 0) {
13485 arg->ch = FORMAT_READ(ctx);
13486 ctx->fmtpos++;
13487 switch (arg->ch) {
13488 case '-': arg->flags |= F_LJUST; continue;
13489 case '+': arg->flags |= F_SIGN; continue;
13490 case ' ': arg->flags |= F_BLANK; continue;
13491 case '#': arg->flags |= F_ALT; continue;
13492 case '0': arg->flags |= F_ZERO; continue;
13493 }
13494 break;
13495 }
13496
13497 /* Parse width. Example: "%10s" => width=10 */
13498 arg->width = -1;
13499 if (arg->ch == '*') {
13500 v = unicode_format_getnextarg(ctx);
13501 if (v == NULL)
13502 return -1;
13503 if (!PyLong_Check(v)) {
13504 PyErr_SetString(PyExc_TypeError,
13505 "* wants int");
13506 return -1;
13507 }
13508 arg->width = PyLong_AsLong(v);
13509 if (arg->width == -1 && PyErr_Occurred())
13510 return -1;
13511 if (arg->width < 0) {
13512 arg->flags |= F_LJUST;
13513 arg->width = -arg->width;
13514 }
13515 if (--ctx->fmtcnt >= 0) {
13516 arg->ch = FORMAT_READ(ctx);
13517 ctx->fmtpos++;
13518 }
13519 }
13520 else if (arg->ch >= '0' && arg->ch <= '9') {
13521 arg->width = arg->ch - '0';
13522 while (--ctx->fmtcnt >= 0) {
13523 arg->ch = FORMAT_READ(ctx);
13524 ctx->fmtpos++;
13525 if (arg->ch < '0' || arg->ch > '9')
13526 break;
13527 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13528 mixing signed and unsigned comparison. Since arg->ch is between
13529 '0' and '9', casting to int is safe. */
13530 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13531 PyErr_SetString(PyExc_ValueError,
13532 "width too big");
13533 return -1;
13534 }
13535 arg->width = arg->width*10 + (arg->ch - '0');
13536 }
13537 }
13538
13539 /* Parse precision. Example: "%.3f" => prec=3 */
13540 arg->prec = -1;
13541 if (arg->ch == '.') {
13542 arg->prec = 0;
13543 if (--ctx->fmtcnt >= 0) {
13544 arg->ch = FORMAT_READ(ctx);
13545 ctx->fmtpos++;
13546 }
13547 if (arg->ch == '*') {
13548 v = unicode_format_getnextarg(ctx);
13549 if (v == NULL)
13550 return -1;
13551 if (!PyLong_Check(v)) {
13552 PyErr_SetString(PyExc_TypeError,
13553 "* wants int");
13554 return -1;
13555 }
13556 arg->prec = PyLong_AsLong(v);
13557 if (arg->prec == -1 && PyErr_Occurred())
13558 return -1;
13559 if (arg->prec < 0)
13560 arg->prec = 0;
13561 if (--ctx->fmtcnt >= 0) {
13562 arg->ch = FORMAT_READ(ctx);
13563 ctx->fmtpos++;
13564 }
13565 }
13566 else if (arg->ch >= '0' && arg->ch <= '9') {
13567 arg->prec = arg->ch - '0';
13568 while (--ctx->fmtcnt >= 0) {
13569 arg->ch = FORMAT_READ(ctx);
13570 ctx->fmtpos++;
13571 if (arg->ch < '0' || arg->ch > '9')
13572 break;
13573 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13574 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013575 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013576 return -1;
13577 }
13578 arg->prec = arg->prec*10 + (arg->ch - '0');
13579 }
13580 }
13581 }
13582
13583 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13584 if (ctx->fmtcnt >= 0) {
13585 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13586 if (--ctx->fmtcnt >= 0) {
13587 arg->ch = FORMAT_READ(ctx);
13588 ctx->fmtpos++;
13589 }
13590 }
13591 }
13592 if (ctx->fmtcnt < 0) {
13593 PyErr_SetString(PyExc_ValueError,
13594 "incomplete format");
13595 return -1;
13596 }
13597 return 0;
13598
13599#undef FORMAT_READ
13600}
13601
13602/* Format one argument. Supported conversion specifiers:
13603
13604 - "s", "r", "a": any type
13605 - "i", "d", "u", "o", "x", "X": int
13606 - "e", "E", "f", "F", "g", "G": float
13607 - "c": int or str (1 character)
13608
13609 Return 0 if the argument has been formatted into *p_str,
13610 1 if the argument has been written into ctx->writer,
13611 -1 on error. */
13612static int
13613unicode_format_arg_format(struct unicode_formatter_t *ctx,
13614 struct unicode_format_arg_t *arg,
13615 PyObject **p_str)
13616{
13617 PyObject *v;
13618 _PyUnicodeWriter *writer = &ctx->writer;
13619
13620 if (ctx->fmtcnt == 0)
13621 ctx->writer.overallocate = 0;
13622
13623 if (arg->ch == '%') {
13624 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13625 return -1;
13626 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13627 writer->pos += 1;
13628 return 1;
13629 }
13630
13631 v = unicode_format_getnextarg(ctx);
13632 if (v == NULL)
13633 return -1;
13634
13635 arg->sign = 0;
13636
13637 switch (arg->ch) {
13638
13639 case 's':
13640 case 'r':
13641 case 'a':
13642 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13643 /* Fast path */
13644 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13645 return -1;
13646 return 1;
13647 }
13648
13649 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13650 *p_str = v;
13651 Py_INCREF(*p_str);
13652 }
13653 else {
13654 if (arg->ch == 's')
13655 *p_str = PyObject_Str(v);
13656 else if (arg->ch == 'r')
13657 *p_str = PyObject_Repr(v);
13658 else
13659 *p_str = PyObject_ASCII(v);
13660 }
13661 break;
13662
13663 case 'i':
13664 case 'd':
13665 case 'u':
13666 case 'o':
13667 case 'x':
13668 case 'X':
13669 {
13670 int ret = mainformatlong(v, arg, p_str, writer);
13671 if (ret != 0)
13672 return ret;
13673 arg->sign = 1;
13674 break;
13675 }
13676
13677 case 'e':
13678 case 'E':
13679 case 'f':
13680 case 'F':
13681 case 'g':
13682 case 'G':
13683 if (arg->width == -1 && arg->prec == -1
13684 && !(arg->flags & (F_SIGN | F_BLANK)))
13685 {
13686 /* Fast path */
13687 if (formatfloat(v, arg, NULL, writer) == -1)
13688 return -1;
13689 return 1;
13690 }
13691
13692 arg->sign = 1;
13693 if (formatfloat(v, arg, p_str, NULL) == -1)
13694 return -1;
13695 break;
13696
13697 case 'c':
13698 {
13699 Py_UCS4 ch = formatchar(v);
13700 if (ch == (Py_UCS4) -1)
13701 return -1;
13702 if (arg->width == -1 && arg->prec == -1) {
13703 /* Fast path */
13704 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13705 return -1;
13706 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13707 writer->pos += 1;
13708 return 1;
13709 }
13710 *p_str = PyUnicode_FromOrdinal(ch);
13711 break;
13712 }
13713
13714 default:
13715 PyErr_Format(PyExc_ValueError,
13716 "unsupported format character '%c' (0x%x) "
13717 "at index %zd",
13718 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13719 (int)arg->ch,
13720 ctx->fmtpos - 1);
13721 return -1;
13722 }
13723 if (*p_str == NULL)
13724 return -1;
13725 assert (PyUnicode_Check(*p_str));
13726 return 0;
13727}
13728
13729static int
13730unicode_format_arg_output(struct unicode_formatter_t *ctx,
13731 struct unicode_format_arg_t *arg,
13732 PyObject *str)
13733{
13734 Py_ssize_t len;
13735 enum PyUnicode_Kind kind;
13736 void *pbuf;
13737 Py_ssize_t pindex;
13738 Py_UCS4 signchar;
13739 Py_ssize_t buflen;
13740 Py_UCS4 maxchar, bufmaxchar;
13741 Py_ssize_t sublen;
13742 _PyUnicodeWriter *writer = &ctx->writer;
13743 Py_UCS4 fill;
13744
13745 fill = ' ';
13746 if (arg->sign && arg->flags & F_ZERO)
13747 fill = '0';
13748
13749 if (PyUnicode_READY(str) == -1)
13750 return -1;
13751
13752 len = PyUnicode_GET_LENGTH(str);
13753 if ((arg->width == -1 || arg->width <= len)
13754 && (arg->prec == -1 || arg->prec >= len)
13755 && !(arg->flags & (F_SIGN | F_BLANK)))
13756 {
13757 /* Fast path */
13758 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13759 return -1;
13760 return 0;
13761 }
13762
13763 /* Truncate the string for "s", "r" and "a" formats
13764 if the precision is set */
13765 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13766 if (arg->prec >= 0 && len > arg->prec)
13767 len = arg->prec;
13768 }
13769
13770 /* Adjust sign and width */
13771 kind = PyUnicode_KIND(str);
13772 pbuf = PyUnicode_DATA(str);
13773 pindex = 0;
13774 signchar = '\0';
13775 if (arg->sign) {
13776 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13777 if (ch == '-' || ch == '+') {
13778 signchar = ch;
13779 len--;
13780 pindex++;
13781 }
13782 else if (arg->flags & F_SIGN)
13783 signchar = '+';
13784 else if (arg->flags & F_BLANK)
13785 signchar = ' ';
13786 else
13787 arg->sign = 0;
13788 }
13789 if (arg->width < len)
13790 arg->width = len;
13791
13792 /* Prepare the writer */
13793 bufmaxchar = 127;
13794 if (!(arg->flags & F_LJUST)) {
13795 if (arg->sign) {
13796 if ((arg->width-1) > len)
13797 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13798 }
13799 else {
13800 if (arg->width > len)
13801 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13802 }
13803 }
13804 maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13805 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13806 buflen = arg->width;
13807 if (arg->sign && len == arg->width)
13808 buflen++;
13809 if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1)
13810 return -1;
13811
13812 /* Write the sign if needed */
13813 if (arg->sign) {
13814 if (fill != ' ') {
13815 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13816 writer->pos += 1;
13817 }
13818 if (arg->width > len)
13819 arg->width--;
13820 }
13821
13822 /* Write the numeric prefix for "x", "X" and "o" formats
13823 if the alternate form is used.
13824 For example, write "0x" for the "%#x" format. */
13825 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13826 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13827 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
13828 if (fill != ' ') {
13829 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13830 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13831 writer->pos += 2;
13832 pindex += 2;
13833 }
13834 arg->width -= 2;
13835 if (arg->width < 0)
13836 arg->width = 0;
13837 len -= 2;
13838 }
13839
13840 /* Pad left with the fill character if needed */
13841 if (arg->width > len && !(arg->flags & F_LJUST)) {
13842 sublen = arg->width - len;
13843 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
13844 writer->pos += sublen;
13845 arg->width = len;
13846 }
13847
13848 /* If padding with spaces: write sign if needed and/or numeric prefix if
13849 the alternate form is used */
13850 if (fill == ' ') {
13851 if (arg->sign) {
13852 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13853 writer->pos += 1;
13854 }
13855 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13856 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13857 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
13858 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13859 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13860 writer->pos += 2;
13861 pindex += 2;
13862 }
13863 }
13864
13865 /* Write characters */
13866 if (len) {
13867 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13868 str, pindex, len);
13869 writer->pos += len;
13870 }
13871
13872 /* Pad right with the fill character if needed */
13873 if (arg->width > len) {
13874 sublen = arg->width - len;
13875 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
13876 writer->pos += sublen;
13877 }
13878 return 0;
13879}
13880
13881/* Helper of PyUnicode_Format(): format one arg.
13882 Return 0 on success, raise an exception and return -1 on error. */
13883static int
13884unicode_format_arg(struct unicode_formatter_t *ctx)
13885{
13886 struct unicode_format_arg_t arg;
13887 PyObject *str;
13888 int ret;
13889
13890 ret = unicode_format_arg_parse(ctx, &arg);
13891 if (ret == -1)
13892 return -1;
13893
13894 ret = unicode_format_arg_format(ctx, &arg, &str);
13895 if (ret == -1)
13896 return -1;
13897
13898 if (ret != 1) {
13899 ret = unicode_format_arg_output(ctx, &arg, str);
13900 Py_DECREF(str);
13901 if (ret == -1)
13902 return -1;
13903 }
13904
13905 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
13906 PyErr_SetString(PyExc_TypeError,
13907 "not all arguments converted during string formatting");
13908 return -1;
13909 }
13910 return 0;
13911}
13912
Alexander Belopolsky40018472011-02-26 01:02:56 +000013913PyObject *
13914PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013915{
Victor Stinnera47082312012-10-04 02:19:54 +020013916 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000013917
Guido van Rossumd57fd912000-03-10 22:53:23 +000013918 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013919 PyErr_BadInternalCall();
13920 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013921 }
Victor Stinnera47082312012-10-04 02:19:54 +020013922
13923 ctx.fmtstr = PyUnicode_FromObject(format);
13924 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013925 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020013926 if (PyUnicode_READY(ctx.fmtstr) == -1) {
13927 Py_DECREF(ctx.fmtstr);
13928 return NULL;
13929 }
13930 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
13931 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
13932 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
13933 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013934
Victor Stinnera47082312012-10-04 02:19:54 +020013935 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013936
Guido van Rossumd57fd912000-03-10 22:53:23 +000013937 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020013938 ctx.arglen = PyTuple_Size(args);
13939 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013940 }
13941 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013942 ctx.arglen = -1;
13943 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013944 }
Victor Stinnera47082312012-10-04 02:19:54 +020013945 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013946 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020013947 ctx.dict = args;
13948 else
13949 ctx.dict = NULL;
13950 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013951
Victor Stinnera47082312012-10-04 02:19:54 +020013952 while (--ctx.fmtcnt >= 0) {
13953 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13954 Py_ssize_t nonfmtpos, sublen;
13955 Py_UCS4 maxchar;
13956
13957 nonfmtpos = ctx.fmtpos++;
13958 while (ctx.fmtcnt >= 0 &&
13959 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13960 ctx.fmtpos++;
13961 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013962 }
Victor Stinnera47082312012-10-04 02:19:54 +020013963 if (ctx.fmtcnt < 0) {
13964 ctx.fmtpos--;
13965 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020013966 }
Victor Stinnera47082312012-10-04 02:19:54 +020013967 sublen = ctx.fmtpos - nonfmtpos;
13968 maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr,
Victor Stinneree4544c2012-05-09 22:24:08 +020013969 nonfmtpos, nonfmtpos + sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020013970 if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013971 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013972
Victor Stinnera47082312012-10-04 02:19:54 +020013973 _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos,
13974 ctx.fmtstr, nonfmtpos, sublen);
13975 ctx.writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013976 }
13977 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013978 ctx.fmtpos++;
13979 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013980 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020013981 }
13982 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013983
Victor Stinnera47082312012-10-04 02:19:54 +020013984 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013985 PyErr_SetString(PyExc_TypeError,
13986 "not all arguments converted during string formatting");
13987 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013988 }
13989
Victor Stinnera47082312012-10-04 02:19:54 +020013990 if (ctx.args_owned) {
13991 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013992 }
Victor Stinnera47082312012-10-04 02:19:54 +020013993 Py_DECREF(ctx.fmtstr);
13994 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013995
Benjamin Peterson29060642009-01-31 22:14:21 +000013996 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020013997 Py_DECREF(ctx.fmtstr);
13998 _PyUnicodeWriter_Dealloc(&ctx.writer);
13999 if (ctx.args_owned) {
14000 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014001 }
14002 return NULL;
14003}
14004
Jeremy Hylton938ace62002-07-17 16:30:39 +000014005static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014006unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14007
Tim Peters6d6c1a32001-08-02 04:15:00 +000014008static PyObject *
14009unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14010{
Benjamin Peterson29060642009-01-31 22:14:21 +000014011 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014012 static char *kwlist[] = {"object", "encoding", "errors", 0};
14013 char *encoding = NULL;
14014 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014015
Benjamin Peterson14339b62009-01-31 16:36:08 +000014016 if (type != &PyUnicode_Type)
14017 return unicode_subtype_new(type, args, kwds);
14018 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014019 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014020 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014021 if (x == NULL) {
14022 Py_INCREF(unicode_empty);
14023 return unicode_empty;
14024 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014025 if (encoding == NULL && errors == NULL)
14026 return PyObject_Str(x);
14027 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014028 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014029}
14030
Guido van Rossume023fe02001-08-30 03:12:59 +000014031static PyObject *
14032unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14033{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014034 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014035 Py_ssize_t length, char_size;
14036 int share_wstr, share_utf8;
14037 unsigned int kind;
14038 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014039
Benjamin Peterson14339b62009-01-31 16:36:08 +000014040 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014041
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014042 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014043 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014044 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014045 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014046 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014047 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014048 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014049 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014050
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014051 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014052 if (self == NULL) {
14053 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014054 return NULL;
14055 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014056 kind = PyUnicode_KIND(unicode);
14057 length = PyUnicode_GET_LENGTH(unicode);
14058
14059 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014060#ifdef Py_DEBUG
14061 _PyUnicode_HASH(self) = -1;
14062#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014063 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014064#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014065 _PyUnicode_STATE(self).interned = 0;
14066 _PyUnicode_STATE(self).kind = kind;
14067 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014068 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014069 _PyUnicode_STATE(self).ready = 1;
14070 _PyUnicode_WSTR(self) = NULL;
14071 _PyUnicode_UTF8_LENGTH(self) = 0;
14072 _PyUnicode_UTF8(self) = NULL;
14073 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014074 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014075
14076 share_utf8 = 0;
14077 share_wstr = 0;
14078 if (kind == PyUnicode_1BYTE_KIND) {
14079 char_size = 1;
14080 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14081 share_utf8 = 1;
14082 }
14083 else if (kind == PyUnicode_2BYTE_KIND) {
14084 char_size = 2;
14085 if (sizeof(wchar_t) == 2)
14086 share_wstr = 1;
14087 }
14088 else {
14089 assert(kind == PyUnicode_4BYTE_KIND);
14090 char_size = 4;
14091 if (sizeof(wchar_t) == 4)
14092 share_wstr = 1;
14093 }
14094
14095 /* Ensure we won't overflow the length. */
14096 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14097 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014098 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014099 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014100 data = PyObject_MALLOC((length + 1) * char_size);
14101 if (data == NULL) {
14102 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014103 goto onError;
14104 }
14105
Victor Stinnerc3c74152011-10-02 20:39:55 +020014106 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014107 if (share_utf8) {
14108 _PyUnicode_UTF8_LENGTH(self) = length;
14109 _PyUnicode_UTF8(self) = data;
14110 }
14111 if (share_wstr) {
14112 _PyUnicode_WSTR_LENGTH(self) = length;
14113 _PyUnicode_WSTR(self) = (wchar_t *)data;
14114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014115
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014116 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014117 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014118 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014119#ifdef Py_DEBUG
14120 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14121#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014122 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014123 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014124
14125onError:
14126 Py_DECREF(unicode);
14127 Py_DECREF(self);
14128 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014129}
14130
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014142
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014143static PyObject *unicode_iter(PyObject *seq);
14144
/* Type object for str.  Slots are given positionally; each one is labelled
   with the PyTypeObject field it fills. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                          /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                              /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                              /* tp_print */
    0,                              /* tp_getattr */
    0,                              /* tp_setattr */
    0,                              /* tp_reserved */
    unicode_repr,                   /* tp_repr */
    &unicode_as_number,             /* tp_as_number */
    &unicode_as_sequence,           /* tp_as_sequence */
    &unicode_as_mapping,            /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                              /* tp_call */
    (reprfunc) unicode_str,         /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                              /* tp_setattro */
    0,                              /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                    /* tp_doc */
    0,                              /* tp_traverse */
    0,                              /* tp_clear */
    PyUnicode_RichCompare,          /* tp_richcompare */
    0,                              /* tp_weaklistoffset */
    unicode_iter,                   /* tp_iter */
    0,                              /* tp_iternext */
    unicode_methods,                /* tp_methods */
    0,                              /* tp_members */
    0,                              /* tp_getset */
    &PyBaseObject_Type,             /* tp_base */
    0,                              /* tp_dict */
    0,                              /* tp_descr_get */
    0,                              /* tp_descr_set */
    0,                              /* tp_dictoffset */
    0,                              /* tp_init */
    0,                              /* tp_alloc */
    unicode_new,                    /* tp_new */
    PyObject_Del,                   /* tp_free */
};
14188
14189/* Initialize the Unicode implementation */
14190
Victor Stinner3a50e702011-10-18 21:21:00 +020014191int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014192{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014193 int i;
14194
Thomas Wouters477c8d52006-05-27 19:21:47 +000014195 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014196 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014197 0x000A, /* LINE FEED */
14198 0x000D, /* CARRIAGE RETURN */
14199 0x001C, /* FILE SEPARATOR */
14200 0x001D, /* GROUP SEPARATOR */
14201 0x001E, /* RECORD SEPARATOR */
14202 0x0085, /* NEXT LINE */
14203 0x2028, /* LINE SEPARATOR */
14204 0x2029, /* PARAGRAPH SEPARATOR */
14205 };
14206
Fred Drakee4315f52000-05-09 19:53:39 +000014207 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014208 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014209 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014210 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014211 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014213 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014214 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014215 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014216 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014217
14218 /* initialize the linebreak bloom filter */
14219 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014220 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014221 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014222
14223 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014224
14225#ifdef HAVE_MBCS
14226 winver.dwOSVersionInfoSize = sizeof(winver);
14227 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14228 PyErr_SetFromWindowsErr(0);
14229 return -1;
14230 }
14231#endif
14232 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014233}
14234
14235/* Finalize the Unicode implementation */
14236
/* Does no work in this implementation and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14242
Guido van Rossumd57fd912000-03-10 22:53:23 +000014243void
Thomas Wouters78890102000-07-22 19:25:51 +000014244_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014245{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014246 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014247
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014248 Py_XDECREF(unicode_empty);
14249 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014250
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014251 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014252 if (unicode_latin1[i]) {
14253 Py_DECREF(unicode_latin1[i]);
14254 unicode_latin1[i] = NULL;
14255 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014256 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014257 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014258 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014259}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014260
/* Intern the string referenced by *p, in place.

   If an equal string is already present in the `interned` dict, the
   reference in *p is released and replaced by a new reference to that
   string.  Otherwise *p itself is stored in the dict (as both key and
   value) and marked SSTATE_INTERNED_MORTAL.

   The call silently does nothing when *p is a str subclass, is already
   interned, or when the dict cannot be created or updated; any exception
   raised on those paths is cleared.  In non-debug builds NULL and
   non-str objects are ignored as well; debug builds assert instead. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The interned dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand that one out. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, s, s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
14312
14313void
14314PyUnicode_InternImmortal(PyObject **p)
14315{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014316 PyUnicode_InternInPlace(p);
14317 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014318 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014319 Py_INCREF(*p);
14320 }
Walter Dörwald16807132007-05-25 13:52:07 +000014321}
14322
14323PyObject *
14324PyUnicode_InternFromString(const char *cp)
14325{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014326 PyObject *s = PyUnicode_FromString(cp);
14327 if (s == NULL)
14328 return NULL;
14329 PyUnicode_InternInPlace(&s);
14330 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014331}
14332
Alexander Belopolsky40018472011-02-26 01:02:56 +000014333void
14334_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014335{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014336 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014337 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014338 Py_ssize_t i, n;
14339 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014340
Benjamin Peterson14339b62009-01-31 16:36:08 +000014341 if (interned == NULL || !PyDict_Check(interned))
14342 return;
14343 keys = PyDict_Keys(interned);
14344 if (keys == NULL || !PyList_Check(keys)) {
14345 PyErr_Clear();
14346 return;
14347 }
Walter Dörwald16807132007-05-25 13:52:07 +000014348
Benjamin Peterson14339b62009-01-31 16:36:08 +000014349 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14350 detector, interned unicode strings are not forcibly deallocated;
14351 rather, we give them their stolen references back, and then clear
14352 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014353
Benjamin Peterson14339b62009-01-31 16:36:08 +000014354 n = PyList_GET_SIZE(keys);
14355 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014356 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014357 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014358 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014359 if (PyUnicode_READY(s) == -1) {
14360 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014361 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014363 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014364 case SSTATE_NOT_INTERNED:
14365 /* XXX Shouldn't happen */
14366 break;
14367 case SSTATE_INTERNED_IMMORTAL:
14368 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014369 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014370 break;
14371 case SSTATE_INTERNED_MORTAL:
14372 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014373 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014374 break;
14375 default:
14376 Py_FatalError("Inconsistent interned string state.");
14377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014378 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014379 }
14380 fprintf(stderr, "total size of all interned strings: "
14381 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14382 "mortal/immortal\n", mortal_size, immortal_size);
14383 Py_DECREF(keys);
14384 PyDict_Clear(interned);
14385 Py_DECREF(interned);
14386 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014387}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014388
14389
14390/********************* Unicode Iterator **************************/
14391
/* State of a str iterator. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* index of the next character to return */
    PyObject *it_seq;       /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14397
/* Deallocator: untrack the iterator from the GC, drop its reference to the
   string (if any) and free the object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
14405
/* GC traversal: the string being iterated is the only object reference
   held by the iterator. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
14412
14413static PyObject *
14414unicodeiter_next(unicodeiterobject *it)
14415{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014416 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014417
Benjamin Peterson14339b62009-01-31 16:36:08 +000014418 assert(it != NULL);
14419 seq = it->it_seq;
14420 if (seq == NULL)
14421 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014422 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014424 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14425 int kind = PyUnicode_KIND(seq);
14426 void *data = PyUnicode_DATA(seq);
14427 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14428 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014429 if (item != NULL)
14430 ++it->it_index;
14431 return item;
14432 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014433
Benjamin Peterson14339b62009-01-31 16:36:08 +000014434 Py_DECREF(seq);
14435 it->it_seq = NULL;
14436 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014437}
14438
14439static PyObject *
14440unicodeiter_len(unicodeiterobject *it)
14441{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014442 Py_ssize_t len = 0;
14443 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014444 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014445 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014446}
14447
14448PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14449
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014450static PyObject *
14451unicodeiter_reduce(unicodeiterobject *it)
14452{
14453 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014454 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014455 it->it_seq, it->it_index);
14456 } else {
14457 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14458 if (u == NULL)
14459 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014460 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014461 }
14462}
14463
14464PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14465
14466static PyObject *
14467unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14468{
14469 Py_ssize_t index = PyLong_AsSsize_t(state);
14470 if (index == -1 && PyErr_Occurred())
14471 return NULL;
14472 if (index < 0)
14473 index = 0;
14474 it->it_index = index;
14475 Py_RETURN_NONE;
14476}
14477
14478PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14479
/* Method table of the str iterator: length hint plus pickling support. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
14489
/* Type object of the str iterator ("str_iterator"): a GC-enabled object
   that is its own iterator. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
14522
14523static PyObject *
14524unicode_iter(PyObject *seq)
14525{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014526 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014527
Benjamin Peterson14339b62009-01-31 16:36:08 +000014528 if (!PyUnicode_Check(seq)) {
14529 PyErr_BadInternalCall();
14530 return NULL;
14531 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014532 if (PyUnicode_READY(seq) == -1)
14533 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014534 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14535 if (it == NULL)
14536 return NULL;
14537 it->it_index = 0;
14538 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014539 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014540 _PyObject_GC_TRACK(it);
14541 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014542}
14543
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014544
14545size_t
14546Py_UNICODE_strlen(const Py_UNICODE *u)
14547{
14548 int res = 0;
14549 while(*u++)
14550 res++;
14551 return res;
14552}
14553
14554Py_UNICODE*
14555Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14556{
14557 Py_UNICODE *u = s1;
14558 while ((*u++ = *s2++));
14559 return s1;
14560}
14561
14562Py_UNICODE*
14563Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14564{
14565 Py_UNICODE *u = s1;
14566 while ((*u++ = *s2++))
14567 if (n-- == 0)
14568 break;
14569 return s1;
14570}
14571
14572Py_UNICODE*
14573Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14574{
14575 Py_UNICODE *u1 = s1;
14576 u1 += Py_UNICODE_strlen(u1);
14577 Py_UNICODE_strcpy(u1, s2);
14578 return s1;
14579}
14580
14581int
14582Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14583{
14584 while (*s1 && *s2 && *s1 == *s2)
14585 s1++, s2++;
14586 if (*s1 && *s2)
14587 return (*s1 < *s2) ? -1 : +1;
14588 if (*s1)
14589 return 1;
14590 if (*s2)
14591 return -1;
14592 return 0;
14593}
14594
14595int
14596Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14597{
14598 register Py_UNICODE u1, u2;
14599 for (; n != 0; n--) {
14600 u1 = *s1;
14601 u2 = *s2;
14602 if (u1 != u2)
14603 return (u1 < u2) ? -1 : +1;
14604 if (u1 == '\0')
14605 return 0;
14606 s1++;
14607 s2++;
14608 }
14609 return 0;
14610}
14611
14612Py_UNICODE*
14613Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14614{
14615 const Py_UNICODE *p;
14616 for (p = s; *p; p++)
14617 if (*p == c)
14618 return (Py_UNICODE*)p;
14619 return NULL;
14620}
14621
14622Py_UNICODE*
14623Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14624{
14625 const Py_UNICODE *p;
14626 p = s + Py_UNICODE_strlen(s);
14627 while (p != s) {
14628 p--;
14629 if (*p == c)
14630 return (Py_UNICODE*)p;
14631 }
14632 return NULL;
14633}
Victor Stinner331ea922010-08-10 16:37:20 +000014634
Victor Stinner71133ff2010-09-01 23:43:53 +000014635Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014636PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014637{
Victor Stinner577db2c2011-10-11 22:12:48 +020014638 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014639 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014641 if (!PyUnicode_Check(unicode)) {
14642 PyErr_BadArgument();
14643 return NULL;
14644 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014645 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014646 if (u == NULL)
14647 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014648 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014649 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014650 PyErr_NoMemory();
14651 return NULL;
14652 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014653 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014654 size *= sizeof(Py_UNICODE);
14655 copy = PyMem_Malloc(size);
14656 if (copy == NULL) {
14657 PyErr_NoMemory();
14658 return NULL;
14659 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014660 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014661 return copy;
14662}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014663
Georg Brandl66c221e2010-10-14 07:04:07 +000014664/* A _string module, to export formatter_parser and formatter_field_name_split
14665 to the string.Formatter class implemented in Python. */
14666
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
14674
/* Module definition for _string. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                              /* m_name */
    PyDoc_STR("string helper module"),      /* m_doc */
    0,                                      /* m_size */
    _string_methods,                        /* m_methods */
    /* remaining slots are left unset */
    NULL,
    NULL,
    NULL,
    NULL
};
14686
/* Module initialisation function for _string: creates the module from
   _string_module and returns whatever PyModule_Create() returns. */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModule_Create(&_string_module);
}
14692
14693
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014694#ifdef __cplusplus
14695}
14696#endif