blob: 6491fdc3169b59e934884c3a74d61533d5b74df8 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000057
58#ifdef __cplusplus
59extern "C" {
60#endif
61
/* Largest code point of Unicode 6.0: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the internal accessor macros in this file.
   Release builds only test the object type; Py_DEBUG builds run
   _PyUnicode_CheckConsistency() with check_content == 0. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020070
/* --- Raw field accessors ------------------------------------------------
   Each macro expands to a plain member access (so it can also be used as
   an assignment target) and performs no checking of its own. */

/* Members reached through PyASCIIObject. */
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)

/* Members reached through PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)         (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)  \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR_LENGTH(op)  \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Member reached through PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors --------------------------------------------------
   These assert _PyUnicode_CHECK(op) before reading the field. */

#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 pointer of a ready string.  For a compact ASCII object this is
   the address immediately following the PyASCIIObject header; otherwise
   it is the stored utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* UTF-8 length of a ready string.  For a compact ASCII object this is the
   string length itself; otherwise it is the stored utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200105
/* Optimized stand-in for Py_MAX() when combining two "maximum character"
   values: use it when you are computing the second argument of
   PyUnicode_New().  The result is the bitwise OR of the operands, which
   is never smaller than either of them.
   NOTE(review): the OR can be larger than the true maximum; presumably
   callers only need an upper bound for PyUnicode_New() -- confirm at the
   call sites. */
#define MAX_MAXCHAR(maxchar1, maxchar2) ((maxchar1) | (maxchar2))
110
/* File-local replacement for the public PyUnicode_READY() macro.
   Asserts _PyUnicode_CHECK(op), then evaluates to 0 when the string is
   already ready and to the result of _PyUnicode_Ready(op) otherwise.
   Note that op is evaluated more than once. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op)                           \
          ? _PyUnicode_Ready(op)                        \
          : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200117
/* True if the utf8 member points at the string's own character data
   (i.e. no separate UTF-8 buffer is involved).  Must not be used on
   compact ASCII objects; this is asserted. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr member points at the string's own character data.
   The macro now uses its own argument throughout; it previously read
   _PyUnicode_WSTR(unicode), silently capturing whatever variable named
   "unicode" happened to be in scope at the expansion site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
125
/* True if the Unicode object owns a separately allocated UTF-8 memory
   block: it is not a compact ASCII object, its utf8 pointer is set, and
   that pointer is not simply the character data (not shared). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                \
      _PyUnicode_UTF8(op) != NULL &&                    \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
133
/* True if the Unicode object owns a separately allocated wstr memory
   block: its wstr pointer is set and either the string is not ready yet
   or wstr is not simply the character data (not shared). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) != NULL &&                    \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *".
   to points to the buffer the converted characters are written to; it is
   cast to "to_type *" as a whole expression, so an argument such as
   "(char *)buf + offset" is converted after the arithmetic, not before.

   Characters are copied four at a time for as long as a full group of
   four remains, then one at a time.  Every argument is evaluated exactly
   once; all temporaries use a leading underscore to stay clear of caller
   names. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
Walter Dörwald16807132007-05-25 13:52:07 +0000166/* This dictionary holds all interned unicode strings. Note that references
167 to strings in this dictionary are *not* counted in the string's ob_refcnt.
168 When the interned string reaches a refcnt of 0 the string deallocation
169 function will delete the reference from this dictionary.
170
171 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000172 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000173*/
174static PyObject *interned;
175
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000176/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200177static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200179/* List of static strings. */
180static _Py_Identifier *static_strings;
181
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000182/* Single character Unicode strings in the Latin-1 range are being
183 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200184static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
216
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200217/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200218static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100220static int unicode_modifiable(PyObject *unicode);
221
Victor Stinnerfe226c02011-10-03 03:52:20 +0200222
Alexander Belopolsky40018472011-02-26 01:02:56 +0000223static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200224_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
225static PyObject *
226_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
227static PyObject *
228_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
229
230static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000231unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000232 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100233 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000234 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
235
Alexander Belopolsky40018472011-02-26 01:02:56 +0000236static void
237raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300238 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100239 PyObject *unicode,
240 Py_ssize_t startpos, Py_ssize_t endpos,
241 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000242
/* Same idea as the ASCII whitespace table, for line breaks: a 128-entry
   table indexed by ASCII code, 1 for a line-break character and 0
   otherwise.  The table is only ever read (see BLOOM_LINEBREAK), so it is
   declared const to keep it out of writable data and to let the compiler
   reject accidental writes. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
270
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300271/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
272 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000273Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000274PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000275{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000276#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000277 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000278#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000279 /* This is actually an illegal character, so it should
280 not be passed to unichr. */
281 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282#endif
283}
284
#ifdef Py_DEBUG
/* Debug-build validation of the internal layout of a unicode object.

   op must be a unicode object.  The state flags, kind, data pointer and
   the utf8/wstr caches are checked against each other with assert().
   When check_content is non-zero and the string is not of the legacy
   wchar kind, the characters are scanned as well to verify that the
   narrowest possible kind is used and that the data is followed by a
   null character.

   Always returns 1, so a call can itself be wrapped in assert(); any
   violation aborts through one of the assertions below. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (!ascii->state.ascii)
                    assert (compact->utf8 != data);
                else {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
            }
            else {
                /* legacy string that is not ready yet: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the
               width of wchar_t */
            if (kind != wchar_kind)
                assert(ascii->wstr != data);
            else {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 maxchar = 0;
        void *chars = PyUnicode_DATA(ascii);
        Py_ssize_t pos = 0;

        while (pos < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (maxchar < ch)
                maxchar = ch;
            pos++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii != 0)
                assert(maxchar < 128);
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the data must be followed by a null character */
        assert(PyUnicode_READ(kind, chars, ascii->length) == 0);
    }
    return 1;
}
#endif
400
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100401static PyObject*
402unicode_result_wchar(PyObject *unicode)
403{
404#ifndef Py_DEBUG
405 Py_ssize_t len;
406
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100407 len = _PyUnicode_WSTR_LENGTH(unicode);
408 if (len == 0) {
409 Py_INCREF(unicode_empty);
410 Py_DECREF(unicode);
411 return unicode_empty;
412 }
413
414 if (len == 1) {
415 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
416 if (ch < 256) {
417 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
418 Py_DECREF(unicode);
419 return latin1_char;
420 }
421 }
422
423 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200424 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425 return NULL;
426 }
427#else
Victor Stinneraa771272012-10-04 02:32:58 +0200428 assert(Py_REFCNT(unicode) == 1);
429
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100430 /* don't make the result ready in debug mode to ensure that the caller
431 makes the string ready before using it */
432 assert(_PyUnicode_CheckConsistency(unicode, 1));
433#endif
434 return unicode;
435}
436
437static PyObject*
438unicode_result_ready(PyObject *unicode)
439{
440 Py_ssize_t length;
441
442 length = PyUnicode_GET_LENGTH(unicode);
443 if (length == 0) {
444 if (unicode != unicode_empty) {
445 Py_INCREF(unicode_empty);
446 Py_DECREF(unicode);
447 }
448 return unicode_empty;
449 }
450
451 if (length == 1) {
452 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
453 if (ch < 256) {
454 PyObject *latin1_char = unicode_latin1[ch];
455 if (latin1_char != NULL) {
456 if (unicode != latin1_char) {
457 Py_INCREF(latin1_char);
458 Py_DECREF(unicode);
459 }
460 return latin1_char;
461 }
462 else {
463 assert(_PyUnicode_CheckConsistency(unicode, 1));
464 Py_INCREF(unicode);
465 unicode_latin1[ch] = unicode;
466 return unicode;
467 }
468 }
469 }
470
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 return unicode;
473}
474
475static PyObject*
476unicode_result(PyObject *unicode)
477{
478 assert(_PyUnicode_CHECK(unicode));
479 if (PyUnicode_IS_READY(unicode))
480 return unicode_result_ready(unicode);
481 else
482 return unicode_result_wchar(unicode);
483}
484
Victor Stinnerc4b49542011-12-11 22:44:26 +0100485static PyObject*
486unicode_result_unchanged(PyObject *unicode)
487{
488 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500489 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490 return NULL;
491 Py_INCREF(unicode);
492 return unicode;
493 }
494 else
495 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100496 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100497}
498
#if defined(HAVE_MBCS)
/* Windows version information, only present when HAVE_MBCS is defined.
   NOTE(review): presumably filled in during module initialization and
   consulted by the MBCS codec -- neither is visible from here. */
static OSVERSIONINFOEX winver;
#endif
502
Thomas Wouters477c8d52006-05-27 19:21:47 +0000503/* --- Bloom Filters ----------------------------------------------------- */
504
505/* stuff to implement simple "bloom filters" for Unicode characters.
506 to keep things simple, we use a single bitmask, using the least 5
507 bits from each unicode characters as the bit index. */
508
509/* the linebreak mask is set up by Unicode_Init below */
510
Antoine Pitrouf068f942010-01-13 14:19:12 +0000511#if LONG_BIT >= 128
512#define BLOOM_WIDTH 128
513#elif LONG_BIT >= 64
514#define BLOOM_WIDTH 64
515#elif LONG_BIT >= 32
516#define BLOOM_WIDTH 32
517#else
518#error "LONG_BIT is smaller than 32"
519#endif
520
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521#define BLOOM_MASK unsigned long
522
523static BLOOM_MASK bloom_linebreak;
524
Antoine Pitrouf068f942010-01-13 14:19:12 +0000525#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
526#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000527
Benjamin Peterson29060642009-01-31 22:14:21 +0000528#define BLOOM_LINEBREAK(ch) \
529 ((ch) < 128U ? ascii_linebreak[(ch)] : \
530 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000531
Alexander Belopolsky40018472011-02-26 01:02:56 +0000532Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000534{
535 /* calculate simple bloom-style bitmask for a given unicode string */
536
Antoine Pitrouf068f942010-01-13 14:19:12 +0000537 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000538 Py_ssize_t i;
539
540 mask = 0;
541 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543
544 return mask;
545}
546
/* True if chr occurs in the string object str.  The bloom mask (which
   the caller built for str) is tested first, so PyUnicode_FindChar() is
   only called when the mask says chr may be present.  chr and str are
   evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     PyUnicode_FindChar((str), (chr), 0,                                \
                        PyUnicode_GET_LENGTH(str), 1) >= 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000550
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200551/* Compilation of templated routines */
552
553#include "stringlib/asciilib.h"
554#include "stringlib/fastsearch.h"
555#include "stringlib/partition.h"
556#include "stringlib/split.h"
557#include "stringlib/count.h"
558#include "stringlib/find.h"
559#include "stringlib/find_max_char.h"
560#include "stringlib/localeutil.h"
561#include "stringlib/undef.h"
562
563#include "stringlib/ucs1lib.h"
564#include "stringlib/fastsearch.h"
565#include "stringlib/partition.h"
566#include "stringlib/split.h"
567#include "stringlib/count.h"
568#include "stringlib/find.h"
569#include "stringlib/find_max_char.h"
570#include "stringlib/localeutil.h"
571#include "stringlib/undef.h"
572
573#include "stringlib/ucs2lib.h"
574#include "stringlib/fastsearch.h"
575#include "stringlib/partition.h"
576#include "stringlib/split.h"
577#include "stringlib/count.h"
578#include "stringlib/find.h"
579#include "stringlib/find_max_char.h"
580#include "stringlib/localeutil.h"
581#include "stringlib/undef.h"
582
583#include "stringlib/ucs4lib.h"
584#include "stringlib/fastsearch.h"
585#include "stringlib/partition.h"
586#include "stringlib/split.h"
587#include "stringlib/count.h"
588#include "stringlib/find.h"
589#include "stringlib/find_max_char.h"
590#include "stringlib/localeutil.h"
591#include "stringlib/undef.h"
592
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200593#include "stringlib/unicodedefs.h"
594#include "stringlib/fastsearch.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100597#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599/* --- Unicode Object ----------------------------------------------------- */
600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200601static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200602fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200603
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200604Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
605 Py_ssize_t size, Py_UCS4 ch,
606 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200608 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
609
610 switch (kind) {
611 case PyUnicode_1BYTE_KIND:
612 {
613 Py_UCS1 ch1 = (Py_UCS1) ch;
614 if (ch1 == ch)
615 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
616 else
617 return -1;
618 }
619 case PyUnicode_2BYTE_KIND:
620 {
621 Py_UCS2 ch2 = (Py_UCS2) ch;
622 if (ch2 == ch)
623 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_4BYTE_KIND:
628 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
629 default:
630 assert(0);
631 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200633}
634
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect
   bugs earlier: every byte of the characters in [old_length, length) is
   set to 0xff.  Nothing is written when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);

    if (length > old_length) {
        /* kind doubles as the character size in bytes */
        Py_UCS1 *tail = data + old_length * kind;
        memset(tail, 0xff, (length - old_length) * kind);
    }
}
#endif
653
Victor Stinnerfe226c02011-10-03 03:52:20 +0200654static PyObject*
655resize_compact(PyObject *unicode, Py_ssize_t length)
656{
657 Py_ssize_t char_size;
658 Py_ssize_t struct_size;
659 Py_ssize_t new_size;
660 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100661 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200662#ifdef Py_DEBUG
663 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
664#endif
665
Victor Stinner79891572012-05-03 13:43:07 +0200666 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200667 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100668 assert(PyUnicode_IS_COMPACT(unicode));
669
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200670 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100671 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200672 struct_size = sizeof(PyASCIIObject);
673 else
674 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200675 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676
Victor Stinnerfe226c02011-10-03 03:52:20 +0200677 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
678 PyErr_NoMemory();
679 return NULL;
680 }
681 new_size = (struct_size + (length + 1) * char_size);
682
Victor Stinner84def372011-12-11 20:04:56 +0100683 _Py_DEC_REFTOTAL;
684 _Py_ForgetReference(unicode);
685
686 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
687 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100688 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200689 PyErr_NoMemory();
690 return NULL;
691 }
Victor Stinner84def372011-12-11 20:04:56 +0100692 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200696 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100698 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200699 _PyUnicode_WSTR_LENGTH(unicode) = length;
700 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200701#ifdef Py_DEBUG
702 unicode_fill_invalid(unicode, old_length);
703#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
705 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200706 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200707 return unicode;
708}
709
Alexander Belopolsky40018472011-02-26 01:02:56 +0000710static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200711resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712{
Victor Stinner95663112011-10-04 01:03:50 +0200713 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100714 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 if (PyUnicode_IS_READY(unicode)) {
719 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200720 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200722#ifdef Py_DEBUG
723 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
724#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725
726 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200727 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200728 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
729 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730
731 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
732 PyErr_NoMemory();
733 return -1;
734 }
735 new_size = (length + 1) * char_size;
736
Victor Stinner7a9105a2011-12-12 00:13:42 +0100737 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
738 {
739 PyObject_DEL(_PyUnicode_UTF8(unicode));
740 _PyUnicode_UTF8(unicode) = NULL;
741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
742 }
743
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744 data = (PyObject *)PyObject_REALLOC(data, new_size);
745 if (data == NULL) {
746 PyErr_NoMemory();
747 return -1;
748 }
749 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200750 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200751 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200752 _PyUnicode_WSTR_LENGTH(unicode) = length;
753 }
754 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200755 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200756 _PyUnicode_UTF8_LENGTH(unicode) = length;
757 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200758 _PyUnicode_LENGTH(unicode) = length;
759 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200760#ifdef Py_DEBUG
761 unicode_fill_invalid(unicode, old_length);
762#endif
Victor Stinner95663112011-10-04 01:03:50 +0200763 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200764 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 }
Victor Stinner95663112011-10-04 01:03:50 +0200768 assert(_PyUnicode_WSTR(unicode) != NULL);
769
770 /* check for integer overflow */
771 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
772 PyErr_NoMemory();
773 return -1;
774 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100775 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200776 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100777 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200778 if (!wstr) {
779 PyErr_NoMemory();
780 return -1;
781 }
782 _PyUnicode_WSTR(unicode) = wstr;
783 _PyUnicode_WSTR(unicode)[length] = 0;
784 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200785 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000786 return 0;
787}
788
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789static PyObject*
790resize_copy(PyObject *unicode, Py_ssize_t length)
791{
792 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100793 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100795
Benjamin Petersonbac79492012-01-14 13:34:47 -0500796 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100797 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200798
799 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
800 if (copy == NULL)
801 return NULL;
802
803 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200804 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200805 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200806 }
807 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200808 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100809
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200810 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 if (w == NULL)
812 return NULL;
813 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
814 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200815 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
816 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200817 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200818 }
819}
820
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000822 Ux0000 terminated; some code (e.g. new_identifier)
823 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824
825 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000827
828*/
829
Alexander Belopolsky40018472011-02-26 01:02:56 +0000830static PyUnicodeObject *
831_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832{
833 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200834 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835
Thomas Wouters477c8d52006-05-27 19:21:47 +0000836 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000837 if (length == 0 && unicode_empty != NULL) {
838 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200839 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840 }
841
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000842 /* Ensure we won't overflow the size. */
843 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
844 return (PyUnicodeObject *)PyErr_NoMemory();
845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 if (length < 0) {
847 PyErr_SetString(PyExc_SystemError,
848 "Negative size passed to _PyUnicode_New");
849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000850 }
851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
853 if (unicode == NULL)
854 return NULL;
855 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
856 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
857 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100858 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000859 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100860 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200862
Jeremy Hyltond8082792003-09-16 19:41:39 +0000863 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000864 * the caller fails before initializing str -- unicode_resize()
865 * reads str[0], and the Keep-Alive optimization can keep memory
866 * allocated for str alive across a call to unicode_dealloc(unicode).
867 * We don't want unicode_resize to read uninitialized memory in
868 * that case.
869 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870 _PyUnicode_WSTR(unicode)[0] = 0;
871 _PyUnicode_WSTR(unicode)[length] = 0;
872 _PyUnicode_WSTR_LENGTH(unicode) = length;
873 _PyUnicode_HASH(unicode) = -1;
874 _PyUnicode_STATE(unicode).interned = 0;
875 _PyUnicode_STATE(unicode).kind = 0;
876 _PyUnicode_STATE(unicode).compact = 0;
877 _PyUnicode_STATE(unicode).ready = 0;
878 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200879 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200881 _PyUnicode_UTF8(unicode) = NULL;
882 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100883 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 return unicode;
885}
886
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887static const char*
888unicode_kind_name(PyObject *unicode)
889{
Victor Stinner42dfd712011-10-03 14:41:45 +0200890 /* don't check consistency: unicode_kind_name() is called from
891 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 if (!PyUnicode_IS_COMPACT(unicode))
893 {
894 if (!PyUnicode_IS_READY(unicode))
895 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600896 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 {
898 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200899 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 return "legacy ascii";
901 else
902 return "legacy latin1";
903 case PyUnicode_2BYTE_KIND:
904 return "legacy UCS2";
905 case PyUnicode_4BYTE_KIND:
906 return "legacy UCS4";
907 default:
908 return "<legacy invalid kind>";
909 }
910 }
911 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600912 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200913 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200914 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200915 return "ascii";
916 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200917 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200918 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200919 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200920 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200921 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200922 default:
923 return "<invalid compact kind>";
924 }
925}
926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
932
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
936void *_PyUnicode_data(void *unicode){
937 printf("obj %p\n", unicode);
938 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
939 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
940 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
941 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
942 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
943 return PyUnicode_DATA(unicode);
944}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200945
946void
947_PyUnicode_Dump(PyObject *op)
948{
949 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200950 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
951 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
952 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200953
Victor Stinnera849a4b2011-10-03 12:12:11 +0200954 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200955 {
956 if (ascii->state.ascii)
957 data = (ascii + 1);
958 else
959 data = (compact + 1);
960 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200961 else
962 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200963 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
964
Victor Stinnera849a4b2011-10-03 12:12:11 +0200965 if (ascii->wstr == data)
966 printf("shared ");
967 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200968
Victor Stinnera3b334d2011-10-03 13:53:37 +0200969 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200970 printf(" (%zu), ", compact->wstr_length);
971 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
972 printf("shared ");
973 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200974 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200975 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200976}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977#endif
978
979PyObject *
980PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
981{
982 PyObject *obj;
983 PyCompactUnicodeObject *unicode;
984 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200985 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200986 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 Py_ssize_t char_size;
988 Py_ssize_t struct_size;
989
990 /* Optimization for empty strings */
991 if (size == 0 && unicode_empty != NULL) {
992 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200993 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200994 }
995
Victor Stinner9e9d6892011-10-04 01:02:02 +0200996 is_ascii = 0;
997 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200998 struct_size = sizeof(PyCompactUnicodeObject);
999 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001000 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001 char_size = 1;
1002 is_ascii = 1;
1003 struct_size = sizeof(PyASCIIObject);
1004 }
1005 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001006 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001007 char_size = 1;
1008 }
1009 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001010 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001011 char_size = 2;
1012 if (sizeof(wchar_t) == 2)
1013 is_sharing = 1;
1014 }
1015 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001016 if (maxchar > MAX_UNICODE) {
1017 PyErr_SetString(PyExc_SystemError,
1018 "invalid maximum character passed to PyUnicode_New");
1019 return NULL;
1020 }
Victor Stinner8f825062012-04-27 13:55:39 +02001021 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001022 char_size = 4;
1023 if (sizeof(wchar_t) == 4)
1024 is_sharing = 1;
1025 }
1026
1027 /* Ensure we won't overflow the size. */
1028 if (size < 0) {
1029 PyErr_SetString(PyExc_SystemError,
1030 "Negative size passed to PyUnicode_New");
1031 return NULL;
1032 }
1033 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1034 return PyErr_NoMemory();
1035
1036 /* Duplicated allocation code from _PyObject_New() instead of a call to
1037 * PyObject_New() so we are able to allocate space for the object and
1038 * it's data buffer.
1039 */
1040 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1041 if (obj == NULL)
1042 return PyErr_NoMemory();
1043 obj = PyObject_INIT(obj, &PyUnicode_Type);
1044 if (obj == NULL)
1045 return NULL;
1046
1047 unicode = (PyCompactUnicodeObject *)obj;
1048 if (is_ascii)
1049 data = ((PyASCIIObject*)obj) + 1;
1050 else
1051 data = unicode + 1;
1052 _PyUnicode_LENGTH(unicode) = size;
1053 _PyUnicode_HASH(unicode) = -1;
1054 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001055 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 _PyUnicode_STATE(unicode).compact = 1;
1057 _PyUnicode_STATE(unicode).ready = 1;
1058 _PyUnicode_STATE(unicode).ascii = is_ascii;
1059 if (is_ascii) {
1060 ((char*)data)[size] = 0;
1061 _PyUnicode_WSTR(unicode) = NULL;
1062 }
Victor Stinner8f825062012-04-27 13:55:39 +02001063 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 ((char*)data)[size] = 0;
1065 _PyUnicode_WSTR(unicode) = NULL;
1066 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001068 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 else {
1071 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001072 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001073 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001075 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001076 ((Py_UCS4*)data)[size] = 0;
1077 if (is_sharing) {
1078 _PyUnicode_WSTR_LENGTH(unicode) = size;
1079 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1080 }
1081 else {
1082 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1083 _PyUnicode_WSTR(unicode) = NULL;
1084 }
1085 }
Victor Stinner8f825062012-04-27 13:55:39 +02001086#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001087 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001088#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001089 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 return obj;
1091}
1092
1093#if SIZEOF_WCHAR_T == 2
1094/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1095 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001096 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097
1098 This function assumes that unicode can hold one more code point than wstr
1099 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001100static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103{
1104 const wchar_t *iter;
1105 Py_UCS4 *ucs4_out;
1106
Victor Stinner910337b2011-10-03 03:20:16 +02001107 assert(unicode != NULL);
1108 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1110 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1111
1112 for (iter = begin; iter < end; ) {
1113 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1114 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001115 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1116 && (iter+1) < end
1117 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 {
Victor Stinner551ac952011-11-29 22:58:13 +01001119 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 iter += 2;
1121 }
1122 else {
1123 *ucs4_out++ = *iter;
1124 iter++;
1125 }
1126 }
1127 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1128 _PyUnicode_GET_LENGTH(unicode)));
1129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130}
1131#endif
1132
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133static int
Victor Stinner488fa492011-12-12 00:01:39 +01001134unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001135{
Victor Stinner488fa492011-12-12 00:01:39 +01001136 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001137 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001138 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return -1;
1140 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141 return 0;
1142}
1143
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144static int
1145_copy_characters(PyObject *to, Py_ssize_t to_start,
1146 PyObject *from, Py_ssize_t from_start,
1147 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 unsigned int from_kind, to_kind;
1150 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151
Victor Stinneree4544c2012-05-09 22:24:08 +02001152 assert(0 <= how_many);
1153 assert(0 <= from_start);
1154 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001155 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001157 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158
Victor Stinnerd3f08822012-05-29 12:57:52 +02001159 assert(PyUnicode_Check(to));
1160 assert(PyUnicode_IS_READY(to));
1161 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1162
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001163 if (how_many == 0)
1164 return 0;
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001167 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170
Victor Stinnerf1852262012-06-16 16:38:26 +02001171#ifdef Py_DEBUG
1172 if (!check_maxchar
1173 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1174 {
1175 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1176 Py_UCS4 ch;
1177 Py_ssize_t i;
1178 for (i=0; i < how_many; i++) {
1179 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1180 assert(ch <= to_maxchar);
1181 }
1182 }
1183#endif
1184
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001185 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001186 if (check_maxchar
1187 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1188 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001189 /* Writing Latin-1 characters into an ASCII string requires to
1190 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001191 Py_UCS4 max_char;
1192 max_char = ucs1lib_find_max_char(from_data,
1193 (Py_UCS1*)from_data + how_many);
1194 if (max_char >= 128)
1195 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001196 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001197 Py_MEMCPY((char*)to_data + to_kind * to_start,
1198 (char*)from_data + from_kind * from_start,
1199 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001201 else if (from_kind == PyUnicode_1BYTE_KIND
1202 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 {
1204 _PyUnicode_CONVERT_BYTES(
1205 Py_UCS1, Py_UCS2,
1206 PyUnicode_1BYTE_DATA(from) + from_start,
1207 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1208 PyUnicode_2BYTE_DATA(to) + to_start
1209 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001210 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001211 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001212 && to_kind == PyUnicode_4BYTE_KIND)
1213 {
1214 _PyUnicode_CONVERT_BYTES(
1215 Py_UCS1, Py_UCS4,
1216 PyUnicode_1BYTE_DATA(from) + from_start,
1217 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1218 PyUnicode_4BYTE_DATA(to) + to_start
1219 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001220 }
1221 else if (from_kind == PyUnicode_2BYTE_KIND
1222 && to_kind == PyUnicode_4BYTE_KIND)
1223 {
1224 _PyUnicode_CONVERT_BYTES(
1225 Py_UCS2, Py_UCS4,
1226 PyUnicode_2BYTE_DATA(from) + from_start,
1227 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1228 PyUnicode_4BYTE_DATA(to) + to_start
1229 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001230 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001231 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001232 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1233
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001234 if (!check_maxchar) {
1235 if (from_kind == PyUnicode_2BYTE_KIND
1236 && to_kind == PyUnicode_1BYTE_KIND)
1237 {
1238 _PyUnicode_CONVERT_BYTES(
1239 Py_UCS2, Py_UCS1,
1240 PyUnicode_2BYTE_DATA(from) + from_start,
1241 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1242 PyUnicode_1BYTE_DATA(to) + to_start
1243 );
1244 }
1245 else if (from_kind == PyUnicode_4BYTE_KIND
1246 && to_kind == PyUnicode_1BYTE_KIND)
1247 {
1248 _PyUnicode_CONVERT_BYTES(
1249 Py_UCS4, Py_UCS1,
1250 PyUnicode_4BYTE_DATA(from) + from_start,
1251 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1252 PyUnicode_1BYTE_DATA(to) + to_start
1253 );
1254 }
1255 else if (from_kind == PyUnicode_4BYTE_KIND
1256 && to_kind == PyUnicode_2BYTE_KIND)
1257 {
1258 _PyUnicode_CONVERT_BYTES(
1259 Py_UCS4, Py_UCS2,
1260 PyUnicode_4BYTE_DATA(from) + from_start,
1261 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1262 PyUnicode_2BYTE_DATA(to) + to_start
1263 );
1264 }
1265 else {
1266 assert(0);
1267 return -1;
1268 }
1269 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001270 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001271 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001272 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001273 Py_ssize_t i;
1274
Victor Stinnera0702ab2011-09-29 14:14:38 +02001275 for (i=0; i < how_many; i++) {
1276 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001277 if (ch > to_maxchar)
1278 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001279 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1280 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001281 }
1282 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001283 return 0;
1284}
1285
Victor Stinnerd3f08822012-05-29 12:57:52 +02001286void
1287_PyUnicode_FastCopyCharacters(
1288 PyObject *to, Py_ssize_t to_start,
1289 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001290{
1291 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1292}
1293
1294Py_ssize_t
1295PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1296 PyObject *from, Py_ssize_t from_start,
1297 Py_ssize_t how_many)
1298{
1299 int err;
1300
1301 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1302 PyErr_BadInternalCall();
1303 return -1;
1304 }
1305
Benjamin Petersonbac79492012-01-14 13:34:47 -05001306 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001307 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001308 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001309 return -1;
1310
Victor Stinnerd3f08822012-05-29 12:57:52 +02001311 if (from_start < 0) {
1312 PyErr_SetString(PyExc_IndexError, "string index out of range");
1313 return -1;
1314 }
1315 if (to_start < 0) {
1316 PyErr_SetString(PyExc_IndexError, "string index out of range");
1317 return -1;
1318 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001319 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1320 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1321 PyErr_Format(PyExc_SystemError,
1322 "Cannot write %zi characters at %zi "
1323 "in a string of %zi characters",
1324 how_many, to_start, PyUnicode_GET_LENGTH(to));
1325 return -1;
1326 }
1327
1328 if (how_many == 0)
1329 return 0;
1330
Victor Stinner488fa492011-12-12 00:01:39 +01001331 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001332 return -1;
1333
1334 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1335 if (err) {
1336 PyErr_Format(PyExc_SystemError,
1337 "Cannot copy %s characters "
1338 "into a string of %s characters",
1339 unicode_kind_name(from),
1340 unicode_kind_name(to));
1341 return -1;
1342 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001343 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344}
1345
Victor Stinner17222162011-09-28 22:15:37 +02001346/* Find the maximum code point and count the number of surrogate pairs so a
1347 correct string length can be computed before converting a string to UCS4.
1348 This function counts single surrogates as a character and not as a pair.
1349
1350 Return 0 on success, or -1 on error. */
1351static int
1352find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1353 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001354{
1355 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001356 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357
Victor Stinnerc53be962011-10-02 21:33:54 +02001358 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001359 *num_surrogates = 0;
1360 *maxchar = 0;
1361
1362 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001364 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1365 && (iter+1) < end
1366 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001368 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 iter += 2;
1371 }
1372 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001374 {
1375 ch = *iter;
1376 iter++;
1377 }
1378 if (ch > *maxchar) {
1379 *maxchar = ch;
1380 if (*maxchar > MAX_UNICODE) {
1381 PyErr_Format(PyExc_ValueError,
1382 "character U+%x is not in range [U+0000; U+10ffff]",
1383 ch);
1384 return -1;
1385 }
1386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 }
1388 return 0;
1389}
1390
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001391int
1392_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393{
1394 wchar_t *end;
1395 Py_UCS4 maxchar = 0;
1396 Py_ssize_t num_surrogates;
1397#if SIZEOF_WCHAR_T == 2
1398 Py_ssize_t length_wo_surrogates;
1399#endif
1400
Georg Brandl7597add2011-10-05 16:36:47 +02001401 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001402 strings were created using _PyObject_New() and where no canonical
1403 representation (the str field) has been set yet aka strings
1404 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001405 assert(_PyUnicode_CHECK(unicode));
1406 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001408 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001409 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001410 /* Actually, it should neither be interned nor be anything else: */
1411 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001414 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001415 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417
1418 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001419 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1420 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 PyErr_NoMemory();
1422 return -1;
1423 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001424 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 _PyUnicode_WSTR(unicode), end,
1426 PyUnicode_1BYTE_DATA(unicode));
1427 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1428 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1429 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1430 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001431 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001432 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001433 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 }
1435 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001436 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001437 _PyUnicode_UTF8(unicode) = NULL;
1438 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 }
1440 PyObject_FREE(_PyUnicode_WSTR(unicode));
1441 _PyUnicode_WSTR(unicode) = NULL;
1442 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1443 }
1444 /* In this case we might have to convert down from 4-byte native
1445 wchar_t to 2-byte unicode. */
1446 else if (maxchar < 65536) {
1447 assert(num_surrogates == 0 &&
1448 "FindMaxCharAndNumSurrogatePairs() messed up");
1449
Victor Stinner506f5922011-09-28 22:34:18 +02001450#if SIZEOF_WCHAR_T == 2
1451 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001452 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001453 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1454 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1455 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001456 _PyUnicode_UTF8(unicode) = NULL;
1457 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001458#else
1459 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001460 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001461 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001462 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001463 PyErr_NoMemory();
1464 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 }
Victor Stinner506f5922011-09-28 22:34:18 +02001466 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1467 _PyUnicode_WSTR(unicode), end,
1468 PyUnicode_2BYTE_DATA(unicode));
1469 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1470 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1471 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001472 _PyUnicode_UTF8(unicode) = NULL;
1473 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001474 PyObject_FREE(_PyUnicode_WSTR(unicode));
1475 _PyUnicode_WSTR(unicode) = NULL;
1476 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1477#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 }
1479 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1480 else {
1481#if SIZEOF_WCHAR_T == 2
1482 /* in case the native representation is 2-bytes, we need to allocate a
1483 new normalized 4-byte version. */
1484 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001485 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1486 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 PyErr_NoMemory();
1488 return -1;
1489 }
1490 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1491 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001492 _PyUnicode_UTF8(unicode) = NULL;
1493 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001494 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1495 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001496 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 PyObject_FREE(_PyUnicode_WSTR(unicode));
1498 _PyUnicode_WSTR(unicode) = NULL;
1499 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1500#else
1501 assert(num_surrogates == 0);
1502
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001505 _PyUnicode_UTF8(unicode) = NULL;
1506 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1508#endif
1509 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1510 }
1511 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001512 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001513 return 0;
1514}
1515
Alexander Belopolsky40018472011-02-26 01:02:56 +00001516static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001517unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518{
Walter Dörwald16807132007-05-25 13:52:07 +00001519 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001520 case SSTATE_NOT_INTERNED:
1521 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001522
Benjamin Peterson29060642009-01-31 22:14:21 +00001523 case SSTATE_INTERNED_MORTAL:
1524 /* revive dead object temporarily for DelItem */
1525 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001526 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001527 Py_FatalError(
1528 "deletion of interned string failed");
1529 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 case SSTATE_INTERNED_IMMORTAL:
1532 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001533
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 default:
1535 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001536 }
1537
Victor Stinner03490912011-10-03 23:45:12 +02001538 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001539 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001540 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001541 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001542 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1543 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001545 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001546}
1547
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string or a cached one-character Latin-1 string),
   0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only strings of length 1 that are not of the legacy wchar_t kind
       are compared against the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1564
Alexander Belopolsky40018472011-02-26 01:02:56 +00001565static int
Victor Stinner488fa492011-12-12 00:01:39 +01001566unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001567{
Victor Stinner488fa492011-12-12 00:01:39 +01001568 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001569 if (Py_REFCNT(unicode) != 1)
1570 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (_PyUnicode_HASH(unicode) != -1)
1572 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001573 if (PyUnicode_CHECK_INTERNED(unicode))
1574 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001575 if (!PyUnicode_CheckExact(unicode))
1576 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001577#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001578 /* singleton refcount is greater than 1 */
1579 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001580#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 return 1;
1582}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584static int
1585unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1586{
1587 PyObject *unicode;
1588 Py_ssize_t old_length;
1589
1590 assert(p_unicode != NULL);
1591 unicode = *p_unicode;
1592
1593 assert(unicode != NULL);
1594 assert(PyUnicode_Check(unicode));
1595 assert(0 <= length);
1596
Victor Stinner910337b2011-10-03 03:20:16 +02001597 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001598 old_length = PyUnicode_WSTR_LENGTH(unicode);
1599 else
1600 old_length = PyUnicode_GET_LENGTH(unicode);
1601 if (old_length == length)
1602 return 0;
1603
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001604 if (length == 0) {
1605 Py_DECREF(*p_unicode);
1606 *p_unicode = unicode_empty;
1607 Py_INCREF(*p_unicode);
1608 return 0;
1609 }
1610
Victor Stinner488fa492011-12-12 00:01:39 +01001611 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001612 PyObject *copy = resize_copy(unicode, length);
1613 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001614 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 Py_DECREF(*p_unicode);
1616 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618 }
1619
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001621 PyObject *new_unicode = resize_compact(unicode, length);
1622 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001624 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001625 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001626 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001627 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001628}
1629
Alexander Belopolsky40018472011-02-26 01:02:56 +00001630int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 PyObject *unicode;
1634 if (p_unicode == NULL) {
1635 PyErr_BadInternalCall();
1636 return -1;
1637 }
1638 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001639 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001640 {
1641 PyErr_BadInternalCall();
1642 return -1;
1643 }
1644 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001645}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001646
Victor Stinnerc5166102012-02-22 13:55:02 +01001647/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001648
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001649 WARNING: The function doesn't copy the terminating null character and
1650 doesn't check the maximum character (may write a latin1 character in an
1651 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001652static void
1653unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1654 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001655{
1656 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1657 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001658 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001659
1660 switch (kind) {
1661 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001662 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001663#ifdef Py_DEBUG
1664 if (PyUnicode_IS_ASCII(unicode)) {
1665 Py_UCS4 maxchar = ucs1lib_find_max_char(
1666 (const Py_UCS1*)str,
1667 (const Py_UCS1*)str + len);
1668 assert(maxchar < 128);
1669 }
1670#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001671 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001672 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001673 }
1674 case PyUnicode_2BYTE_KIND: {
1675 Py_UCS2 *start = (Py_UCS2 *)data + index;
1676 Py_UCS2 *ucs2 = start;
1677 assert(index <= PyUnicode_GET_LENGTH(unicode));
1678
Victor Stinner184252a2012-06-16 02:57:41 +02001679 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001680 *ucs2 = (Py_UCS2)*str;
1681
1682 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001683 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001684 }
1685 default: {
1686 Py_UCS4 *start = (Py_UCS4 *)data + index;
1687 Py_UCS4 *ucs4 = start;
1688 assert(kind == PyUnicode_4BYTE_KIND);
1689 assert(index <= PyUnicode_GET_LENGTH(unicode));
1690
Victor Stinner184252a2012-06-16 02:57:41 +02001691 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001692 *ucs4 = (Py_UCS4)*str;
1693
1694 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001695 }
1696 }
1697}
1698
1699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700static PyObject*
1701get_latin1_char(unsigned char ch)
1702{
Victor Stinnera464fc12011-10-02 20:39:30 +02001703 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001705 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 if (!unicode)
1707 return NULL;
1708 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001709 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 unicode_latin1[ch] = unicode;
1711 }
1712 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001713 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714}
1715
Alexander Belopolsky40018472011-02-26 01:02:56 +00001716PyObject *
1717PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001719 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720 Py_UCS4 maxchar = 0;
1721 Py_ssize_t num_surrogates;
1722
1723 if (u == NULL)
1724 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001726 /* If the Unicode data is known at construction time, we can apply
1727 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 /* Optimization for empty strings */
1730 if (size == 0 && unicode_empty != NULL) {
1731 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001732 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001733 }
Tim Petersced69f82003-09-16 20:30:58 +00001734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 /* Single character Unicode objects in the Latin-1 range are
1736 shared when using this constructor */
1737 if (size == 1 && *u < 256)
1738 return get_latin1_char((unsigned char)*u);
1739
1740 /* If not empty and not single character, copy the Unicode data
1741 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001742 if (find_maxchar_surrogates(u, u + size,
1743 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 return NULL;
1745
Victor Stinner8faf8212011-12-08 22:14:11 +01001746 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 if (!unicode)
1748 return NULL;
1749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 switch (PyUnicode_KIND(unicode)) {
1751 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001752 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1754 break;
1755 case PyUnicode_2BYTE_KIND:
1756#if Py_UNICODE_SIZE == 2
1757 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1758#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001759 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1761#endif
1762 break;
1763 case PyUnicode_4BYTE_KIND:
1764#if SIZEOF_WCHAR_T == 2
1765 /* This is the only case which has to process surrogates, thus
1766 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001767 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768#else
1769 assert(num_surrogates == 0);
1770 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1771#endif
1772 break;
1773 default:
1774 assert(0 && "Impossible state");
1775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001777 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778}
1779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780PyObject *
1781PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001782{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001783 if (size < 0) {
1784 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001786 return NULL;
1787 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001788 if (u != NULL)
1789 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1790 else
1791 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001792}
1793
Alexander Belopolsky40018472011-02-26 01:02:56 +00001794PyObject *
1795PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001796{
1797 size_t size = strlen(u);
1798 if (size > PY_SSIZE_T_MAX) {
1799 PyErr_SetString(PyExc_OverflowError, "input too long");
1800 return NULL;
1801 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001802 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001803}
1804
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001805PyObject *
1806_PyUnicode_FromId(_Py_Identifier *id)
1807{
1808 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001809 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1810 strlen(id->string),
1811 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001812 if (!id->object)
1813 return NULL;
1814 PyUnicode_InternInPlace(&id->object);
1815 assert(!id->next);
1816 id->next = static_strings;
1817 static_strings = id;
1818 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001819 return id->object;
1820}
1821
1822void
1823_PyUnicode_ClearStaticStrings()
1824{
1825 _Py_Identifier *i;
1826 for (i = static_strings; i; i = i->next) {
1827 Py_DECREF(i->object);
1828 i->object = NULL;
1829 i->next = NULL;
1830 }
1831}
1832
Benjamin Peterson0df54292012-03-26 14:50:32 -04001833/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834
Victor Stinnerd3f08822012-05-29 12:57:52 +02001835PyObject*
1836_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001837{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001838 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001839 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001840 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001841#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001842 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001843#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001844 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001845 }
Victor Stinner785938e2011-12-11 20:09:03 +01001846 unicode = PyUnicode_New(size, 127);
1847 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001848 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001849 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1850 assert(_PyUnicode_CheckConsistency(unicode, 1));
1851 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001852}
1853
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001854static Py_UCS4
1855kind_maxchar_limit(unsigned int kind)
1856{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001857 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001858 case PyUnicode_1BYTE_KIND:
1859 return 0x80;
1860 case PyUnicode_2BYTE_KIND:
1861 return 0x100;
1862 case PyUnicode_4BYTE_KIND:
1863 return 0x10000;
1864 default:
1865 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001866 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001867 }
1868}
1869
Victor Stinnere6abb482012-05-02 01:15:40 +02001870Py_LOCAL_INLINE(Py_UCS4)
1871align_maxchar(Py_UCS4 maxchar)
1872{
1873 if (maxchar <= 127)
1874 return 127;
1875 else if (maxchar <= 255)
1876 return 255;
1877 else if (maxchar <= 65535)
1878 return 65535;
1879 else
1880 return MAX_UNICODE;
1881}
1882
Victor Stinner702c7342011-10-05 13:50:52 +02001883static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001884_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001887 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001888
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001889 if (size == 0) {
1890 Py_INCREF(unicode_empty);
1891 return unicode_empty;
1892 }
1893 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001894 if (size == 1)
1895 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001896
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001897 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001898 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001899 if (!res)
1900 return NULL;
1901 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001902 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001904}
1905
Victor Stinnere57b1c02011-09-28 22:20:48 +02001906static PyObject*
1907_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908{
1909 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001910 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001911
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001912 if (size == 0) {
1913 Py_INCREF(unicode_empty);
1914 return unicode_empty;
1915 }
1916 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001917 if (size == 1) {
1918 Py_UCS4 ch = u[0];
1919 if (ch < 256)
1920 return get_latin1_char((unsigned char)ch);
1921
1922 res = PyUnicode_New(1, ch);
1923 if (res == NULL)
1924 return NULL;
1925 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1926 assert(_PyUnicode_CheckConsistency(res, 1));
1927 return res;
1928 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001929
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001930 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001931 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 if (!res)
1933 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001934 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001936 else {
1937 _PyUnicode_CONVERT_BYTES(
1938 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1939 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001940 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941 return res;
1942}
1943
Victor Stinnere57b1c02011-09-28 22:20:48 +02001944static PyObject*
1945_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946{
1947 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001948 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001949
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001950 if (size == 0) {
1951 Py_INCREF(unicode_empty);
1952 return unicode_empty;
1953 }
1954 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001955 if (size == 1) {
1956 Py_UCS4 ch = u[0];
1957 if (ch < 256)
1958 return get_latin1_char((unsigned char)ch);
1959
1960 res = PyUnicode_New(1, ch);
1961 if (res == NULL)
1962 return NULL;
1963 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1964 assert(_PyUnicode_CheckConsistency(res, 1));
1965 return res;
1966 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001967
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001968 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001969 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001970 if (!res)
1971 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001972 if (max_char < 256)
1973 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1974 PyUnicode_1BYTE_DATA(res));
1975 else if (max_char < 0x10000)
1976 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1977 PyUnicode_2BYTE_DATA(res));
1978 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 return res;
1982}
1983
1984PyObject*
1985PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1986{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001987 if (size < 0) {
1988 PyErr_SetString(PyExc_ValueError, "size must be positive");
1989 return NULL;
1990 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001991 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001993 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001995 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001997 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001998 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001999 PyErr_SetString(PyExc_SystemError, "invalid kind");
2000 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002}
2003
Victor Stinnerece58de2012-04-23 23:36:38 +02002004Py_UCS4
2005_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2006{
2007 enum PyUnicode_Kind kind;
2008 void *startptr, *endptr;
2009
2010 assert(PyUnicode_IS_READY(unicode));
2011 assert(0 <= start);
2012 assert(end <= PyUnicode_GET_LENGTH(unicode));
2013 assert(start <= end);
2014
2015 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2016 return PyUnicode_MAX_CHAR_VALUE(unicode);
2017
2018 if (start == end)
2019 return 127;
2020
Victor Stinner94d558b2012-04-27 22:26:58 +02002021 if (PyUnicode_IS_ASCII(unicode))
2022 return 127;
2023
Victor Stinnerece58de2012-04-23 23:36:38 +02002024 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002025 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002026 endptr = (char *)startptr + end * kind;
2027 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002028 switch(kind) {
2029 case PyUnicode_1BYTE_KIND:
2030 return ucs1lib_find_max_char(startptr, endptr);
2031 case PyUnicode_2BYTE_KIND:
2032 return ucs2lib_find_max_char(startptr, endptr);
2033 case PyUnicode_4BYTE_KIND:
2034 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002035 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002036 assert(0);
2037 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002038 }
2039}
2040
Victor Stinner25a4b292011-10-06 12:31:55 +02002041/* Ensure that a string uses the most efficient storage, if it is not the
2042 case: create a new string with of the right kind. Write NULL into *p_unicode
2043 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002044static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002045unicode_adjust_maxchar(PyObject **p_unicode)
2046{
2047 PyObject *unicode, *copy;
2048 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002049 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002050 unsigned int kind;
2051
2052 assert(p_unicode != NULL);
2053 unicode = *p_unicode;
2054 assert(PyUnicode_IS_READY(unicode));
2055 if (PyUnicode_IS_ASCII(unicode))
2056 return;
2057
2058 len = PyUnicode_GET_LENGTH(unicode);
2059 kind = PyUnicode_KIND(unicode);
2060 if (kind == PyUnicode_1BYTE_KIND) {
2061 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002062 max_char = ucs1lib_find_max_char(u, u + len);
2063 if (max_char >= 128)
2064 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002065 }
2066 else if (kind == PyUnicode_2BYTE_KIND) {
2067 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002068 max_char = ucs2lib_find_max_char(u, u + len);
2069 if (max_char >= 256)
2070 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002071 }
2072 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002073 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002074 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002075 max_char = ucs4lib_find_max_char(u, u + len);
2076 if (max_char >= 0x10000)
2077 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002078 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002079 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002080 if (copy != NULL)
2081 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002082 Py_DECREF(unicode);
2083 *p_unicode = copy;
2084}
2085
Victor Stinner034f6cf2011-09-30 02:26:44 +02002086PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002087_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002088{
Victor Stinner87af4f22011-11-21 23:03:47 +01002089 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002090 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002091
Victor Stinner034f6cf2011-09-30 02:26:44 +02002092 if (!PyUnicode_Check(unicode)) {
2093 PyErr_BadInternalCall();
2094 return NULL;
2095 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002096 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002097 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002098
Victor Stinner87af4f22011-11-21 23:03:47 +01002099 length = PyUnicode_GET_LENGTH(unicode);
2100 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002101 if (!copy)
2102 return NULL;
2103 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2104
Victor Stinner87af4f22011-11-21 23:03:47 +01002105 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2106 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002107 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002108 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002109}
2110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002111
Victor Stinnerbc603d12011-10-02 01:00:40 +02002112/* Widen Unicode objects to larger buffers. Don't write terminating null
2113 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002114
2115void*
2116_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2117{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002118 Py_ssize_t len;
2119 void *result;
2120 unsigned int skind;
2121
Benjamin Petersonbac79492012-01-14 13:34:47 -05002122 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002123 return NULL;
2124
2125 len = PyUnicode_GET_LENGTH(s);
2126 skind = PyUnicode_KIND(s);
2127 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002128 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 return NULL;
2130 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002131 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002132 case PyUnicode_2BYTE_KIND:
2133 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2134 if (!result)
2135 return PyErr_NoMemory();
2136 assert(skind == PyUnicode_1BYTE_KIND);
2137 _PyUnicode_CONVERT_BYTES(
2138 Py_UCS1, Py_UCS2,
2139 PyUnicode_1BYTE_DATA(s),
2140 PyUnicode_1BYTE_DATA(s) + len,
2141 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002142 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002143 case PyUnicode_4BYTE_KIND:
2144 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2145 if (!result)
2146 return PyErr_NoMemory();
2147 if (skind == PyUnicode_2BYTE_KIND) {
2148 _PyUnicode_CONVERT_BYTES(
2149 Py_UCS2, Py_UCS4,
2150 PyUnicode_2BYTE_DATA(s),
2151 PyUnicode_2BYTE_DATA(s) + len,
2152 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002154 else {
2155 assert(skind == PyUnicode_1BYTE_KIND);
2156 _PyUnicode_CONVERT_BYTES(
2157 Py_UCS1, Py_UCS4,
2158 PyUnicode_1BYTE_DATA(s),
2159 PyUnicode_1BYTE_DATA(s) + len,
2160 result);
2161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163 default:
2164 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 }
Victor Stinner01698042011-10-04 00:04:26 +02002166 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167 return NULL;
2168}
2169
2170static Py_UCS4*
2171as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2172 int copy_null)
2173{
2174 int kind;
2175 void *data;
2176 Py_ssize_t len, targetlen;
2177 if (PyUnicode_READY(string) == -1)
2178 return NULL;
2179 kind = PyUnicode_KIND(string);
2180 data = PyUnicode_DATA(string);
2181 len = PyUnicode_GET_LENGTH(string);
2182 targetlen = len;
2183 if (copy_null)
2184 targetlen++;
2185 if (!target) {
2186 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2187 PyErr_NoMemory();
2188 return NULL;
2189 }
2190 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2191 if (!target) {
2192 PyErr_NoMemory();
2193 return NULL;
2194 }
2195 }
2196 else {
2197 if (targetsize < targetlen) {
2198 PyErr_Format(PyExc_SystemError,
2199 "string is longer than the buffer");
2200 if (copy_null && 0 < targetsize)
2201 target[0] = 0;
2202 return NULL;
2203 }
2204 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002205 if (kind == PyUnicode_1BYTE_KIND) {
2206 Py_UCS1 *start = (Py_UCS1 *) data;
2207 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002209 else if (kind == PyUnicode_2BYTE_KIND) {
2210 Py_UCS2 *start = (Py_UCS2 *) data;
2211 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2212 }
2213 else {
2214 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 if (copy_null)
2218 target[len] = 0;
2219 return target;
2220}
2221
2222Py_UCS4*
2223PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2224 int copy_null)
2225{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002226 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 PyErr_BadInternalCall();
2228 return NULL;
2229 }
2230 return as_ucs4(string, target, targetsize, copy_null);
2231}
2232
2233Py_UCS4*
2234PyUnicode_AsUCS4Copy(PyObject *string)
2235{
2236 return as_ucs4(string, NULL, 0, 1);
2237}
2238
2239#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002240
Alexander Belopolsky40018472011-02-26 01:02:56 +00002241PyObject *
2242PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002245 if (size == 0) {
2246 Py_INCREF(unicode_empty);
2247 return unicode_empty;
2248 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002249 PyErr_BadInternalCall();
2250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 }
2252
Martin v. Löwis790465f2008-04-05 20:41:37 +00002253 if (size == -1) {
2254 size = wcslen(w);
2255 }
2256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258}
2259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002261
Walter Dörwald346737f2007-05-31 10:44:43 +00002262static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002263makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002264 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002265{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002267 if (longflag)
2268 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002269 else if (longlongflag) {
2270 /* longlongflag should only ever be nonzero on machines with
2271 HAVE_LONG_LONG defined */
2272#ifdef HAVE_LONG_LONG
2273 char *f = PY_FORMAT_LONG_LONG;
2274 while (*f)
2275 *fmt++ = *f++;
2276#else
2277 /* we shouldn't ever get here */
2278 assert(0);
2279 *fmt++ = 'l';
2280#endif
2281 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 else if (size_tflag) {
2283 char *f = PY_FORMAT_SIZE_T;
2284 while (*f)
2285 *fmt++ = *f++;
2286 }
2287 *fmt++ = c;
2288 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002289}
2290
Victor Stinner15a11362012-10-06 23:48:20 +02002291/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002292 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2293 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2294#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002295
2296static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002297unicode_fromformat_arg(_PyUnicodeWriter *writer,
2298 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002299{
Victor Stinnere215d962012-10-06 23:03:36 +02002300 const char *p;
2301 Py_ssize_t len;
2302 int zeropad;
2303 int width;
2304 int precision;
2305 int longflag;
2306 int longlongflag;
2307 int size_tflag;
2308 int fill;
2309
2310 p = f;
2311 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002312 zeropad = 0;
2313 if (*f == '0') {
2314 zeropad = 1;
2315 f++;
2316 }
Victor Stinner96865452011-03-01 23:44:09 +00002317
2318 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002319 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002320 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002321 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2322 PyErr_SetString(PyExc_ValueError,
2323 "width too big");
2324 return NULL;
2325 }
Victor Stinnere215d962012-10-06 23:03:36 +02002326 width = (width*10) + (*f - '0');
2327 f++;
2328 }
Victor Stinner96865452011-03-01 23:44:09 +00002329 precision = 0;
2330 if (*f == '.') {
2331 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002332 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002333 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2334 PyErr_SetString(PyExc_ValueError,
2335 "precision too big");
2336 return NULL;
2337 }
Victor Stinnere215d962012-10-06 23:03:36 +02002338 precision = (precision*10) + (*f - '0');
2339 f++;
2340 }
Victor Stinner96865452011-03-01 23:44:09 +00002341 if (*f == '%') {
2342 /* "%.3%s" => f points to "3" */
2343 f--;
2344 }
2345 }
2346 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002347 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002348 f--;
2349 }
Victor Stinner96865452011-03-01 23:44:09 +00002350
2351 /* Handle %ld, %lu, %lld and %llu. */
2352 longflag = 0;
2353 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002354 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002355 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002356 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002357 longflag = 1;
2358 ++f;
2359 }
2360#ifdef HAVE_LONG_LONG
2361 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002362 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002363 longlongflag = 1;
2364 f += 2;
2365 }
2366#endif
2367 }
2368 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002369 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002370 size_tflag = 1;
2371 ++f;
2372 }
Victor Stinnere215d962012-10-06 23:03:36 +02002373
2374 if (f[1] == '\0')
2375 writer->overallocate = 0;
2376
2377 switch (*f) {
2378 case 'c':
2379 {
2380 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002381 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2382 PyErr_SetString(PyExc_ValueError,
2383 "character argument not in range(0x110000)");
2384 return NULL;
2385 }
Victor Stinnere215d962012-10-06 23:03:36 +02002386 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2387 return NULL;
2388 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2389 writer->pos++;
2390 break;
2391 }
2392
2393 case 'i':
2394 case 'd':
2395 case 'u':
2396 case 'x':
2397 {
2398 /* used by sprintf */
2399 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002400 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002401
2402 if (*f == 'u') {
2403 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2404
2405 if (longflag)
2406 len = sprintf(buffer, fmt,
2407 va_arg(*vargs, unsigned long));
2408#ifdef HAVE_LONG_LONG
2409 else if (longlongflag)
2410 len = sprintf(buffer, fmt,
2411 va_arg(*vargs, unsigned PY_LONG_LONG));
2412#endif
2413 else if (size_tflag)
2414 len = sprintf(buffer, fmt,
2415 va_arg(*vargs, size_t));
2416 else
2417 len = sprintf(buffer, fmt,
2418 va_arg(*vargs, unsigned int));
2419 }
2420 else if (*f == 'x') {
2421 makefmt(fmt, 0, 0, 0, 'x');
2422 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2423 }
2424 else {
2425 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2426
2427 if (longflag)
2428 len = sprintf(buffer, fmt,
2429 va_arg(*vargs, long));
2430#ifdef HAVE_LONG_LONG
2431 else if (longlongflag)
2432 len = sprintf(buffer, fmt,
2433 va_arg(*vargs, PY_LONG_LONG));
2434#endif
2435 else if (size_tflag)
2436 len = sprintf(buffer, fmt,
2437 va_arg(*vargs, Py_ssize_t));
2438 else
2439 len = sprintf(buffer, fmt,
2440 va_arg(*vargs, int));
2441 }
2442 assert(len >= 0);
2443
Victor Stinnere215d962012-10-06 23:03:36 +02002444 if (precision < len)
2445 precision = len;
2446 if (width > precision) {
2447 Py_UCS4 fillchar;
2448 fill = width - precision;
2449 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002450 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2451 return NULL;
2452 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2453 return NULL;
2454 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002455 }
Victor Stinner15a11362012-10-06 23:48:20 +02002456 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002457 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002458 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2459 return NULL;
2460 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2461 return NULL;
2462 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002463 }
Victor Stinner15a11362012-10-06 23:48:20 +02002464 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002465 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002466 break;
2467 }
2468
2469 case 'p':
2470 {
2471 char number[MAX_LONG_LONG_CHARS];
2472
2473 len = sprintf(number, "%p", va_arg(*vargs, void*));
2474 assert(len >= 0);
2475
2476 /* %p is ill-defined: ensure leading 0x. */
2477 if (number[1] == 'X')
2478 number[1] = 'x';
2479 else if (number[1] != 'x') {
2480 memmove(number + 2, number,
2481 strlen(number) + 1);
2482 number[0] = '0';
2483 number[1] = 'x';
2484 len += 2;
2485 }
2486
2487 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2488 return NULL;
2489 break;
2490 }
2491
2492 case 's':
2493 {
2494 /* UTF-8 */
2495 const char *s = va_arg(*vargs, const char*);
2496 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2497 if (!str)
2498 return NULL;
2499 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2500 Py_DECREF(str);
2501 return NULL;
2502 }
2503 Py_DECREF(str);
2504 break;
2505 }
2506
2507 case 'U':
2508 {
2509 PyObject *obj = va_arg(*vargs, PyObject *);
2510 assert(obj && _PyUnicode_CHECK(obj));
2511
2512 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2513 return NULL;
2514 break;
2515 }
2516
2517 case 'V':
2518 {
2519 PyObject *obj = va_arg(*vargs, PyObject *);
2520 const char *str = va_arg(*vargs, const char *);
2521 PyObject *str_obj;
2522 assert(obj || str);
2523 if (obj) {
2524 assert(_PyUnicode_CHECK(obj));
2525 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2526 return NULL;
2527 }
2528 else {
2529 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2530 if (!str_obj)
2531 return NULL;
2532 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2533 Py_DECREF(str_obj);
2534 return NULL;
2535 }
2536 Py_DECREF(str_obj);
2537 }
2538 break;
2539 }
2540
2541 case 'S':
2542 {
2543 PyObject *obj = va_arg(*vargs, PyObject *);
2544 PyObject *str;
2545 assert(obj);
2546 str = PyObject_Str(obj);
2547 if (!str)
2548 return NULL;
2549 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2550 Py_DECREF(str);
2551 return NULL;
2552 }
2553 Py_DECREF(str);
2554 break;
2555 }
2556
2557 case 'R':
2558 {
2559 PyObject *obj = va_arg(*vargs, PyObject *);
2560 PyObject *repr;
2561 assert(obj);
2562 repr = PyObject_Repr(obj);
2563 if (!repr)
2564 return NULL;
2565 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2566 Py_DECREF(repr);
2567 return NULL;
2568 }
2569 Py_DECREF(repr);
2570 break;
2571 }
2572
2573 case 'A':
2574 {
2575 PyObject *obj = va_arg(*vargs, PyObject *);
2576 PyObject *ascii;
2577 assert(obj);
2578 ascii = PyObject_ASCII(obj);
2579 if (!ascii)
2580 return NULL;
2581 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2582 Py_DECREF(ascii);
2583 return NULL;
2584 }
2585 Py_DECREF(ascii);
2586 break;
2587 }
2588
2589 case '%':
2590 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2591 return NULL;
2592 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2593 writer->pos++;
2594 break;
2595
2596 default:
2597 /* if we stumble upon an unknown formatting code, copy the rest
2598 of the format string to the output string. (we cannot just
2599 skip the code, since there's no way to know what's in the
2600 argument list) */
2601 len = strlen(p);
2602 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2603 return NULL;
2604 f = p+len;
2605 return f;
2606 }
2607
2608 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002609 return f;
2610}
2611
Walter Dörwaldd2034312007-05-18 16:29:38 +00002612PyObject *
2613PyUnicode_FromFormatV(const char *format, va_list vargs)
2614{
Victor Stinnere215d962012-10-06 23:03:36 +02002615 va_list vargs2;
2616 const char *f;
2617 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002618
Victor Stinnere215d962012-10-06 23:03:36 +02002619 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2620
2621 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2622 Copy it to be able to pass a reference to a subfunction. */
2623 Py_VA_COPY(vargs2, vargs);
2624
2625 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002627 f = unicode_fromformat_arg(&writer, f, &vargs2);
2628 if (f == NULL)
2629 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002632 const char *p;
2633 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002634
Victor Stinnere215d962012-10-06 23:03:36 +02002635 p = f;
2636 do
2637 {
2638 if ((unsigned char)*p > 127) {
2639 PyErr_Format(PyExc_ValueError,
2640 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2641 "string, got a non-ASCII byte: 0x%02x",
2642 (unsigned char)*p);
2643 return NULL;
2644 }
2645 p++;
2646 }
2647 while (*p != '\0' && *p != '%');
2648 len = p - f;
2649
2650 if (*p == '\0')
2651 writer.overallocate = 0;
2652 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2653 goto fail;
2654 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2655 writer.pos += len;
2656
2657 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002658 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 }
Victor Stinnere215d962012-10-06 23:03:36 +02002660 return _PyUnicodeWriter_Finish(&writer);
2661
2662 fail:
2663 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002664 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002665}
2666
Walter Dörwaldd2034312007-05-18 16:29:38 +00002667PyObject *
2668PyUnicode_FromFormat(const char *format, ...)
2669{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 PyObject* ret;
2671 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002672
2673#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002675#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002676 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002677#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 ret = PyUnicode_FromFormatV(format, vargs);
2679 va_end(vargs);
2680 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002681}
2682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683#ifdef HAVE_WCHAR_H
2684
Victor Stinner5593d8a2010-10-02 11:11:27 +00002685/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2686 convert a Unicode object to a wide character string.
2687
Victor Stinnerd88d9832011-09-06 02:00:05 +02002688 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002689 character) required to convert the unicode object. Ignore size argument.
2690
Victor Stinnerd88d9832011-09-06 02:00:05 +02002691 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002692 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002693 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002694static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002695unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002696 wchar_t *w,
2697 Py_ssize_t size)
2698{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002699 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 const wchar_t *wstr;
2701
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002702 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002703 if (wstr == NULL)
2704 return -1;
2705
Victor Stinner5593d8a2010-10-02 11:11:27 +00002706 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002707 if (size > res)
2708 size = res + 1;
2709 else
2710 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002712 return res;
2713 }
2714 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002716}
2717
2718Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002719PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002720 wchar_t *w,
2721 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722{
2723 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002724 PyErr_BadInternalCall();
2725 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002727 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728}
2729
Victor Stinner137c34c2010-09-29 10:25:54 +00002730wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002731PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002732 Py_ssize_t *size)
2733{
2734 wchar_t* buffer;
2735 Py_ssize_t buflen;
2736
2737 if (unicode == NULL) {
2738 PyErr_BadInternalCall();
2739 return NULL;
2740 }
2741
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002742 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 if (buflen == -1)
2744 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002745 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002746 PyErr_NoMemory();
2747 return NULL;
2748 }
2749
Victor Stinner137c34c2010-09-29 10:25:54 +00002750 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2751 if (buffer == NULL) {
2752 PyErr_NoMemory();
2753 return NULL;
2754 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002755 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002756 if (buflen == -1) {
2757 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002758 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002759 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002760 if (size != NULL)
2761 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002762 return buffer;
2763}
2764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002765#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766
Alexander Belopolsky40018472011-02-26 01:02:56 +00002767PyObject *
2768PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002771 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002772 PyErr_SetString(PyExc_ValueError,
2773 "chr() arg not in range(0x110000)");
2774 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002775 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002777 if (ordinal < 256)
2778 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 v = PyUnicode_New(1, ordinal);
2781 if (v == NULL)
2782 return NULL;
2783 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002784 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002785 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002786}
2787
Alexander Belopolsky40018472011-02-26 01:02:56 +00002788PyObject *
2789PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002791 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002792 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002793 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002794 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002795 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002796 Py_INCREF(obj);
2797 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002798 }
2799 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002800 /* For a Unicode subtype that's not a Unicode object,
2801 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002802 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002803 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002804 PyErr_Format(PyExc_TypeError,
2805 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002806 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002807 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002808}
2809
Alexander Belopolsky40018472011-02-26 01:02:56 +00002810PyObject *
2811PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002812 const char *encoding,
2813 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002814{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002815 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002816 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002817
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002819 PyErr_BadInternalCall();
2820 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002822
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002823 /* Decoding bytes objects is the most common case and should be fast */
2824 if (PyBytes_Check(obj)) {
2825 if (PyBytes_GET_SIZE(obj) == 0) {
2826 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002827 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002828 }
2829 else {
2830 v = PyUnicode_Decode(
2831 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2832 encoding, errors);
2833 }
2834 return v;
2835 }
2836
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002837 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002838 PyErr_SetString(PyExc_TypeError,
2839 "decoding str is not supported");
2840 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002841 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002842
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002843 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2844 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2845 PyErr_Format(PyExc_TypeError,
2846 "coercing to str: need bytes, bytearray "
2847 "or buffer-like object, %.80s found",
2848 Py_TYPE(obj)->tp_name);
2849 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002850 }
Tim Petersced69f82003-09-16 20:30:58 +00002851
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002852 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002854 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 }
Tim Petersced69f82003-09-16 20:30:58 +00002856 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002857 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002858
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002859 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002860 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861}
2862
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  The result is written to lower, a buffer of lower_len
   bytes, and is always null-terminated on success.  A NULL encoding is
   normalized to "utf-8".

   Return 1 on success, 0 on error (the normalized name, including its
   terminating null byte, does not fit into lower_len bytes). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for at least the null byte nothing can be stored, and
       &lower[lower_len - 1] below would point before the buffer. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof includes the terminating null byte */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    e = encoding;
    l = lower;
    /* last usable position: reserved for the terminating null byte */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2899
Alexander Belopolsky40018472011-02-26 01:02:56 +00002900PyObject *
2901PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002902 Py_ssize_t size,
2903 const char *encoding,
2904 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002905{
2906 PyObject *buffer = NULL, *unicode;
2907 Py_buffer info;
2908 char lower[11]; /* Enough for any encoding shortcut */
2909
Fred Drakee4315f52000-05-09 19:53:39 +00002910 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002911 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002912 if ((strcmp(lower, "utf-8") == 0) ||
2913 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002914 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002915 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002916 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002917 (strcmp(lower, "iso-8859-1") == 0))
2918 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002919#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002920 else if (strcmp(lower, "mbcs") == 0)
2921 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002922#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002923 else if (strcmp(lower, "ascii") == 0)
2924 return PyUnicode_DecodeASCII(s, size, errors);
2925 else if (strcmp(lower, "utf-16") == 0)
2926 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2927 else if (strcmp(lower, "utf-32") == 0)
2928 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930
2931 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002932 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002933 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002934 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002935 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 if (buffer == NULL)
2937 goto onError;
2938 unicode = PyCodec_Decode(buffer, encoding, errors);
2939 if (unicode == NULL)
2940 goto onError;
2941 if (!PyUnicode_Check(unicode)) {
2942 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002943 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002944 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945 Py_DECREF(unicode);
2946 goto onError;
2947 }
2948 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002949 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002950
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952 Py_XDECREF(buffer);
2953 return NULL;
2954}
2955
Alexander Belopolsky40018472011-02-26 01:02:56 +00002956PyObject *
2957PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002958 const char *encoding,
2959 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002960{
2961 PyObject *v;
2962
2963 if (!PyUnicode_Check(unicode)) {
2964 PyErr_BadArgument();
2965 goto onError;
2966 }
2967
2968 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002969 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002970
2971 /* Decode via the codec registry */
2972 v = PyCodec_Decode(unicode, encoding, errors);
2973 if (v == NULL)
2974 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002975 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002976
Benjamin Peterson29060642009-01-31 22:14:21 +00002977 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002978 return NULL;
2979}
2980
Alexander Belopolsky40018472011-02-26 01:02:56 +00002981PyObject *
2982PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002983 const char *encoding,
2984 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002985{
2986 PyObject *v;
2987
2988 if (!PyUnicode_Check(unicode)) {
2989 PyErr_BadArgument();
2990 goto onError;
2991 }
2992
2993 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002994 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002995
2996 /* Decode via the codec registry */
2997 v = PyCodec_Decode(unicode, encoding, errors);
2998 if (v == NULL)
2999 goto onError;
3000 if (!PyUnicode_Check(v)) {
3001 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003002 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003003 Py_TYPE(v)->tp_name);
3004 Py_DECREF(v);
3005 goto onError;
3006 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003007 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003008
Benjamin Peterson29060642009-01-31 22:14:21 +00003009 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003010 return NULL;
3011}
3012
Alexander Belopolsky40018472011-02-26 01:02:56 +00003013PyObject *
3014PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003015 Py_ssize_t size,
3016 const char *encoding,
3017 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018{
3019 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003020
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021 unicode = PyUnicode_FromUnicode(s, size);
3022 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3025 Py_DECREF(unicode);
3026 return v;
3027}
3028
Alexander Belopolsky40018472011-02-26 01:02:56 +00003029PyObject *
3030PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003031 const char *encoding,
3032 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003033{
3034 PyObject *v;
3035
3036 if (!PyUnicode_Check(unicode)) {
3037 PyErr_BadArgument();
3038 goto onError;
3039 }
3040
3041 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003043
3044 /* Encode via the codec registry */
3045 v = PyCodec_Encode(unicode, encoding, errors);
3046 if (v == NULL)
3047 goto onError;
3048 return v;
3049
Benjamin Peterson29060642009-01-31 22:14:21 +00003050 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003051 return NULL;
3052}
3053
/* Find the first character of wstr that wcstombs() is unable to convert.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16-bit wide).  Return the index, in wchar_t units, of
   the first character for which wcstombs() fails, or 0 if every character
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character followed by the terminating null wide character */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3] = {0, 0, 0};
#else
    wchar_t chunk[2] = {0, 0};
#endif
    char encoded[MB_LEN_MAX];
    const wchar_t *cur = wstr;

    while (*cur != L'\0') {
        const wchar_t *char_start = cur;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cur[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cur[1]))
        {
            /* Keep both halves of a surrogate pair together */
            chunk[0] = cur[0];
            chunk[1] = cur[1];
            cur += 2;
        }
        else {
            chunk[0] = cur[0];
            chunk[1] = 0;
            cur += 1;
        }
#else
        chunk[0] = *cur;
        cur += 1;
#endif
        if (wcstombs(encoded, chunk, sizeof(encoded)) == (size_t)-1)
            return (size_t)(char_start - wstr);
    }

    /* failed to find the unencodable character */
    return 0;
}
3100
Victor Stinner1b579672011-12-17 05:47:23 +01003101static int
3102locale_error_handler(const char *errors, int *surrogateescape)
3103{
3104 if (errors == NULL) {
3105 *surrogateescape = 0;
3106 return 0;
3107 }
3108
3109 if (strcmp(errors, "strict") == 0) {
3110 *surrogateescape = 0;
3111 return 0;
3112 }
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003113 if (errors == "surrogateescape"
3114 || strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003115 *surrogateescape = 1;
3116 return 0;
3117 }
3118 PyErr_Format(PyExc_ValueError,
3119 "only 'strict' and 'surrogateescape' error handlers "
3120 "are supported, not '%s'",
3121 errors);
3122 return -1;
3123}
3124
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003125PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003126PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003127{
3128 Py_ssize_t wlen, wlen2;
3129 wchar_t *wstr;
3130 PyObject *bytes = NULL;
3131 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003132 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003133 PyObject *exc;
3134 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003135 int surrogateescape;
3136
3137 if (locale_error_handler(errors, &surrogateescape) < 0)
3138 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003139
3140 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3141 if (wstr == NULL)
3142 return NULL;
3143
3144 wlen2 = wcslen(wstr);
3145 if (wlen2 != wlen) {
3146 PyMem_Free(wstr);
3147 PyErr_SetString(PyExc_TypeError, "embedded null character");
3148 return NULL;
3149 }
3150
3151 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003152 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003153 char *str;
3154
3155 str = _Py_wchar2char(wstr, &error_pos);
3156 if (str == NULL) {
3157 if (error_pos == (size_t)-1) {
3158 PyErr_NoMemory();
3159 PyMem_Free(wstr);
3160 return NULL;
3161 }
3162 else {
3163 goto encode_error;
3164 }
3165 }
3166 PyMem_Free(wstr);
3167
3168 bytes = PyBytes_FromString(str);
3169 PyMem_Free(str);
3170 }
3171 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003172 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003173 size_t len, len2;
3174
3175 len = wcstombs(NULL, wstr, 0);
3176 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003177 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003178 goto encode_error;
3179 }
3180
3181 bytes = PyBytes_FromStringAndSize(NULL, len);
3182 if (bytes == NULL) {
3183 PyMem_Free(wstr);
3184 return NULL;
3185 }
3186
3187 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3188 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003189 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003190 goto encode_error;
3191 }
3192 PyMem_Free(wstr);
3193 }
3194 return bytes;
3195
3196encode_error:
3197 errmsg = strerror(errno);
3198 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003199
3200 if (error_pos == (size_t)-1)
3201 error_pos = wcstombs_errorpos(wstr);
3202
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003203 PyMem_Free(wstr);
3204 Py_XDECREF(bytes);
3205
Victor Stinner2f197072011-12-17 07:08:30 +01003206 if (errmsg != NULL) {
3207 size_t errlen;
3208 wstr = _Py_char2wchar(errmsg, &errlen);
3209 if (wstr != NULL) {
3210 reason = PyUnicode_FromWideChar(wstr, errlen);
3211 PyMem_Free(wstr);
3212 } else
3213 errmsg = NULL;
3214 }
3215 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003216 reason = PyUnicode_FromString(
3217 "wcstombs() encountered an unencodable "
3218 "wide character");
3219 if (reason == NULL)
3220 return NULL;
3221
3222 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3223 "locale", unicode,
3224 (Py_ssize_t)error_pos,
3225 (Py_ssize_t)(error_pos+1),
3226 reason);
3227 Py_DECREF(reason);
3228 if (exc != NULL) {
3229 PyCodec_StrictErrors(exc);
3230 Py_XDECREF(exc);
3231 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003232 return NULL;
3233}
3234
Victor Stinnerad158722010-10-27 00:25:46 +00003235PyObject *
3236PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003237{
Victor Stinner99b95382011-07-04 14:23:54 +02003238#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003239 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003240#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003241 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003242#else
Victor Stinner793b5312011-04-27 00:24:21 +02003243 PyInterpreterState *interp = PyThreadState_GET()->interp;
3244 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3245 cannot use it to encode and decode filenames before it is loaded. Load
3246 the Python codec requires to encode at least its own filename. Use the C
3247 version of the locale codec until the codec registry is initialized and
3248 the Python codec is loaded.
3249
3250 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3251 cannot only rely on it: check also interp->fscodec_initialized for
3252 subinterpreters. */
3253 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003254 return PyUnicode_AsEncodedString(unicode,
3255 Py_FileSystemDefaultEncoding,
3256 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003257 }
3258 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003259 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003260 }
Victor Stinnerad158722010-10-27 00:25:46 +00003261#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003262}
3263
Alexander Belopolsky40018472011-02-26 01:02:56 +00003264PyObject *
3265PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003266 const char *encoding,
3267 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268{
3269 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003270 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003271
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 if (!PyUnicode_Check(unicode)) {
3273 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003274 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 }
Fred Drakee4315f52000-05-09 19:53:39 +00003276
Fred Drakee4315f52000-05-09 19:53:39 +00003277 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003278 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003279 if ((strcmp(lower, "utf-8") == 0) ||
3280 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003281 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003282 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003283 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003284 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003285 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003286 }
Victor Stinner37296e82010-06-10 13:36:23 +00003287 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003288 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003289 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003290 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003291#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003292 else if (strcmp(lower, "mbcs") == 0)
3293 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003294#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003295 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003296 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003297 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298
3299 /* Encode via the codec registry */
3300 v = PyCodec_Encode(unicode, encoding, errors);
3301 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003302 return NULL;
3303
3304 /* The normal path */
3305 if (PyBytes_Check(v))
3306 return v;
3307
3308 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003309 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003310 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003311 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003312
3313 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3314 "encoder %s returned bytearray instead of bytes",
3315 encoding);
3316 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003317 Py_DECREF(v);
3318 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003319 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003320
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003321 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3322 Py_DECREF(v);
3323 return b;
3324 }
3325
3326 PyErr_Format(PyExc_TypeError,
3327 "encoder did not return a bytes object (type=%.400s)",
3328 Py_TYPE(v)->tp_name);
3329 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003330 return NULL;
3331}
3332
Alexander Belopolsky40018472011-02-26 01:02:56 +00003333PyObject *
3334PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003335 const char *encoding,
3336 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003337{
3338 PyObject *v;
3339
3340 if (!PyUnicode_Check(unicode)) {
3341 PyErr_BadArgument();
3342 goto onError;
3343 }
3344
3345 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003346 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003347
3348 /* Encode via the codec registry */
3349 v = PyCodec_Encode(unicode, encoding, errors);
3350 if (v == NULL)
3351 goto onError;
3352 if (!PyUnicode_Check(v)) {
3353 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003354 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003355 Py_TYPE(v)->tp_name);
3356 Py_DECREF(v);
3357 goto onError;
3358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003360
Benjamin Peterson29060642009-01-31 22:14:21 +00003361 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 return NULL;
3363}
3364
/* Find the first byte sequence of str that cannot be decoded.

   str is scanned with mbrtowc(), at most len bytes.  Return the byte offset
   of the first invalid or incomplete multibyte sequence, or 0 if none is
   found.  Without HAVE_MBRTOWC no scan is possible and 0 is always
   returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cur = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cur, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(cur - str);
        }
        cur += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3395
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003396PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003397PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003398 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003399{
3400 wchar_t smallbuf[256];
3401 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3402 wchar_t *wstr;
3403 size_t wlen, wlen2;
3404 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003405 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003406 size_t error_pos;
3407 char *errmsg;
3408 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003409
3410 if (locale_error_handler(errors, &surrogateescape) < 0)
3411 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003412
3413 if (str[len] != '\0' || len != strlen(str)) {
3414 PyErr_SetString(PyExc_TypeError, "embedded null character");
3415 return NULL;
3416 }
3417
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003418 if (surrogateescape) {
3419 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003420 wstr = _Py_char2wchar(str, &wlen);
3421 if (wstr == NULL) {
3422 if (wlen == (size_t)-1)
3423 PyErr_NoMemory();
3424 else
3425 PyErr_SetFromErrno(PyExc_OSError);
3426 return NULL;
3427 }
3428
3429 unicode = PyUnicode_FromWideChar(wstr, wlen);
3430 PyMem_Free(wstr);
3431 }
3432 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003433 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003434#ifndef HAVE_BROKEN_MBSTOWCS
3435 wlen = mbstowcs(NULL, str, 0);
3436#else
3437 wlen = len;
3438#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003439 if (wlen == (size_t)-1)
3440 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003441 if (wlen+1 <= smallbuf_len) {
3442 wstr = smallbuf;
3443 }
3444 else {
3445 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3446 return PyErr_NoMemory();
3447
3448 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3449 if (!wstr)
3450 return PyErr_NoMemory();
3451 }
3452
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003453 wlen2 = mbstowcs(wstr, str, wlen+1);
3454 if (wlen2 == (size_t)-1) {
3455 if (wstr != smallbuf)
3456 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003457 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003458 }
3459#ifdef HAVE_BROKEN_MBSTOWCS
3460 assert(wlen2 == wlen);
3461#endif
3462 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3463 if (wstr != smallbuf)
3464 PyMem_Free(wstr);
3465 }
3466 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003467
3468decode_error:
3469 errmsg = strerror(errno);
3470 assert(errmsg != NULL);
3471
3472 error_pos = mbstowcs_errorpos(str, len);
3473 if (errmsg != NULL) {
3474 size_t errlen;
3475 wstr = _Py_char2wchar(errmsg, &errlen);
3476 if (wstr != NULL) {
3477 reason = PyUnicode_FromWideChar(wstr, errlen);
3478 PyMem_Free(wstr);
3479 } else
3480 errmsg = NULL;
3481 }
3482 if (errmsg == NULL)
3483 reason = PyUnicode_FromString(
3484 "mbstowcs() encountered an invalid multibyte sequence");
3485 if (reason == NULL)
3486 return NULL;
3487
3488 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3489 "locale", str, len,
3490 (Py_ssize_t)error_pos,
3491 (Py_ssize_t)(error_pos+1),
3492 reason);
3493 Py_DECREF(reason);
3494 if (exc != NULL) {
3495 PyCodec_StrictErrors(exc);
3496 Py_XDECREF(exc);
3497 }
3498 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003499}
3500
3501PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003502PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003503{
3504 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003505 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003506}
3507
3508
3509PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003510PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003511 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003512 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3513}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003514
Christian Heimes5894ba72007-11-04 11:43:14 +00003515PyObject*
3516PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3517{
Victor Stinner99b95382011-07-04 14:23:54 +02003518#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003519 return PyUnicode_DecodeMBCS(s, size, NULL);
3520#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003521 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003522#else
Victor Stinner793b5312011-04-27 00:24:21 +02003523 PyInterpreterState *interp = PyThreadState_GET()->interp;
3524 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3525 cannot use it to encode and decode filenames before it is loaded. Load
3526 the Python codec requires to encode at least its own filename. Use the C
3527 version of the locale codec until the codec registry is initialized and
3528 the Python codec is loaded.
3529
3530 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3531 cannot only rely on it: check also interp->fscodec_initialized for
3532 subinterpreters. */
3533 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003534 return PyUnicode_Decode(s, size,
3535 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003536 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003537 }
3538 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003539 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003540 }
Victor Stinnerad158722010-10-27 00:25:46 +00003541#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003542}
3543
Martin v. Löwis011e8422009-05-05 04:43:17 +00003544
3545int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003546_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003547{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003548 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003549
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003550 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003551 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003552 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3553 PyUnicode_GET_LENGTH(str), '\0', 1);
3554 if (pos == -1)
3555 return 0;
3556 else
3557 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003558}
3559
Antoine Pitrou13348842012-01-29 18:36:34 +01003560int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003561PyUnicode_FSConverter(PyObject* arg, void* addr)
3562{
3563 PyObject *output = NULL;
3564 Py_ssize_t size;
3565 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003566 if (arg == NULL) {
3567 Py_DECREF(*(PyObject**)addr);
3568 return 1;
3569 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003570 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003571 output = arg;
3572 Py_INCREF(output);
3573 }
3574 else {
3575 arg = PyUnicode_FromObject(arg);
3576 if (!arg)
3577 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003578 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003579 Py_DECREF(arg);
3580 if (!output)
3581 return 0;
3582 if (!PyBytes_Check(output)) {
3583 Py_DECREF(output);
3584 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3585 return 0;
3586 }
3587 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003588 size = PyBytes_GET_SIZE(output);
3589 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003590 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003591 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003592 Py_DECREF(output);
3593 return 0;
3594 }
3595 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003596 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003597}
3598
3599
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003600int
3601PyUnicode_FSDecoder(PyObject* arg, void* addr)
3602{
3603 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003604 if (arg == NULL) {
3605 Py_DECREF(*(PyObject**)addr);
3606 return 1;
3607 }
3608 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003609 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003610 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003611 output = arg;
3612 Py_INCREF(output);
3613 }
3614 else {
3615 arg = PyBytes_FromObject(arg);
3616 if (!arg)
3617 return 0;
3618 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3619 PyBytes_GET_SIZE(arg));
3620 Py_DECREF(arg);
3621 if (!output)
3622 return 0;
3623 if (!PyUnicode_Check(output)) {
3624 Py_DECREF(output);
3625 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3626 return 0;
3627 }
3628 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003629 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003630 Py_DECREF(output);
3631 return 0;
3632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003633 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003634 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003635 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3636 Py_DECREF(output);
3637 return 0;
3638 }
3639 *(PyObject**)addr = output;
3640 return Py_CLEANUP_SUPPORTED;
3641}
3642
3643
Martin v. Löwis5b222132007-06-10 09:51:05 +00003644char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003645PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003646{
Christian Heimesf3863112007-11-22 07:46:41 +00003647 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003648
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003649 if (!PyUnicode_Check(unicode)) {
3650 PyErr_BadArgument();
3651 return NULL;
3652 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003653 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003654 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003655
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003656 if (PyUnicode_UTF8(unicode) == NULL) {
3657 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003658 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3659 if (bytes == NULL)
3660 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003661 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3662 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003663 Py_DECREF(bytes);
3664 return NULL;
3665 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003666 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3667 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3668 PyBytes_AS_STRING(bytes),
3669 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003670 Py_DECREF(bytes);
3671 }
3672
3673 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003674 *psize = PyUnicode_UTF8_LENGTH(unicode);
3675 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003676}
3677
3678char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003679PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003680{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003681 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3682}
3683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003684Py_UNICODE *
3685PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003687 const unsigned char *one_byte;
3688#if SIZEOF_WCHAR_T == 4
3689 const Py_UCS2 *two_bytes;
3690#else
3691 const Py_UCS4 *four_bytes;
3692 const Py_UCS4 *ucs4_end;
3693 Py_ssize_t num_surrogates;
3694#endif
3695 wchar_t *w;
3696 wchar_t *wchar_end;
3697
3698 if (!PyUnicode_Check(unicode)) {
3699 PyErr_BadArgument();
3700 return NULL;
3701 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003702 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003703 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003704 assert(_PyUnicode_KIND(unicode) != 0);
3705 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003706
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003707 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003708#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003709 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3710 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711 num_surrogates = 0;
3712
3713 for (; four_bytes < ucs4_end; ++four_bytes) {
3714 if (*four_bytes > 0xFFFF)
3715 ++num_surrogates;
3716 }
3717
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003718 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3719 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3720 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003721 PyErr_NoMemory();
3722 return NULL;
3723 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003724 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003725
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003726 w = _PyUnicode_WSTR(unicode);
3727 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3728 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3730 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003731 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003732 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003733 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3734 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003735 }
3736 else
3737 *w = *four_bytes;
3738
3739 if (w > wchar_end) {
3740 assert(0 && "Miscalculated string end");
3741 }
3742 }
3743 *w = 0;
3744#else
3745 /* sizeof(wchar_t) == 4 */
3746 Py_FatalError("Impossible unicode object state, wstr and str "
3747 "should share memory already.");
3748 return NULL;
3749#endif
3750 }
3751 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003752 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3753 (_PyUnicode_LENGTH(unicode) + 1));
3754 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755 PyErr_NoMemory();
3756 return NULL;
3757 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003758 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3759 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3760 w = _PyUnicode_WSTR(unicode);
3761 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003763 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3764 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 for (; w < wchar_end; ++one_byte, ++w)
3766 *w = *one_byte;
3767 /* null-terminate the wstr */
3768 *w = 0;
3769 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003770 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003772 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773 for (; w < wchar_end; ++two_bytes, ++w)
3774 *w = *two_bytes;
3775 /* null-terminate the wstr */
3776 *w = 0;
3777#else
3778 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003779 PyObject_FREE(_PyUnicode_WSTR(unicode));
3780 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 Py_FatalError("Impossible unicode object state, wstr "
3782 "and str should share memory already.");
3783 return NULL;
3784#endif
3785 }
3786 else {
3787 assert(0 && "This should never happen.");
3788 }
3789 }
3790 }
3791 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003792 *size = PyUnicode_WSTR_LENGTH(unicode);
3793 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003794}
3795
Alexander Belopolsky40018472011-02-26 01:02:56 +00003796Py_UNICODE *
3797PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003799 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800}
3801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003802
Alexander Belopolsky40018472011-02-26 01:02:56 +00003803Py_ssize_t
3804PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805{
3806 if (!PyUnicode_Check(unicode)) {
3807 PyErr_BadArgument();
3808 goto onError;
3809 }
3810 return PyUnicode_GET_SIZE(unicode);
3811
Benjamin Peterson29060642009-01-31 22:14:21 +00003812 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 return -1;
3814}
3815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816Py_ssize_t
3817PyUnicode_GetLength(PyObject *unicode)
3818{
Victor Stinner07621332012-06-16 04:53:46 +02003819 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820 PyErr_BadArgument();
3821 return -1;
3822 }
Victor Stinner07621332012-06-16 04:53:46 +02003823 if (PyUnicode_READY(unicode) == -1)
3824 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003825 return PyUnicode_GET_LENGTH(unicode);
3826}
3827
3828Py_UCS4
3829PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3830{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003831 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3832 PyErr_BadArgument();
3833 return (Py_UCS4)-1;
3834 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003835 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003836 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837 return (Py_UCS4)-1;
3838 }
3839 return PyUnicode_READ_CHAR(unicode, index);
3840}
3841
3842int
3843PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3844{
3845 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003846 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847 return -1;
3848 }
Victor Stinner488fa492011-12-12 00:01:39 +01003849 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003850 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003851 PyErr_SetString(PyExc_IndexError, "string index out of range");
3852 return -1;
3853 }
Victor Stinner488fa492011-12-12 00:01:39 +01003854 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003855 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003856 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3857 PyErr_SetString(PyExc_ValueError, "character out of range");
3858 return -1;
3859 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3861 index, ch);
3862 return 0;
3863}
3864
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3870
Victor Stinner554f3f02010-06-16 23:33:54 +00003871/* create or adjust a UnicodeDecodeError */
3872static void
3873make_decode_exception(PyObject **exceptionObject,
3874 const char *encoding,
3875 const char *input, Py_ssize_t length,
3876 Py_ssize_t startpos, Py_ssize_t endpos,
3877 const char *reason)
3878{
3879 if (*exceptionObject == NULL) {
3880 *exceptionObject = PyUnicodeDecodeError_Create(
3881 encoding, input, length, startpos, endpos, reason);
3882 }
3883 else {
3884 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3885 goto onError;
3886 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3887 goto onError;
3888 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3889 goto onError;
3890 }
3891 return;
3892
3893onError:
3894 Py_DECREF(*exceptionObject);
3895 *exceptionObject = NULL;
3896}
3897
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   This variant writes into a wchar_t-kind unicode object (*output) at
   offset *outpos, growing it with unicode_resize() when needed.

   errors / errorHandler : handler name and cached handler object; the
                           handler is looked up on first use.
   input / inend         : current input buffer bounds; updated from the
                           exception's object after the callback runs.
   startinpos / endinpos : bounds of the offending input range; *endinpos
                           is set to the position returned by the handler.
   inptr                 : set to the input position where decoding resumes.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the object must not be read through the bytes macros
           below.  Drop the reference obtained above first. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Plain memory copy of exactly repwlen characters: wcsncpy() would stop
       at an embedded NUL in the replacement and zero-fill the remainder. */
    Py_MEMCPY(_PyUnicode_WSTR(*output) + *outpos, repwstr,
              repwlen * sizeof(wchar_t));
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4004
4005static int
4006unicode_decode_call_errorhandler_writer(
4007 const char *errors, PyObject **errorHandler,
4008 const char *encoding, const char *reason,
4009 const char **input, const char **inend, Py_ssize_t *startinpos,
4010 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4011 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4012{
4013 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4014
4015 PyObject *restuple = NULL;
4016 PyObject *repunicode = NULL;
4017 Py_ssize_t insize;
4018 Py_ssize_t newpos;
4019 PyObject *inputobj = NULL;
4020
4021 if (*errorHandler == NULL) {
4022 *errorHandler = PyCodec_LookupError(errors);
4023 if (*errorHandler == NULL)
4024 goto onError;
4025 }
4026
4027 make_decode_exception(exceptionObject,
4028 encoding,
4029 *input, *inend - *input,
4030 *startinpos, *endinpos,
4031 reason);
4032 if (*exceptionObject == NULL)
4033 goto onError;
4034
4035 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4036 if (restuple == NULL)
4037 goto onError;
4038 if (!PyTuple_Check(restuple)) {
4039 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4040 goto onError;
4041 }
4042 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004043 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004044
4045 /* Copy back the bytes variables, which might have been modified by the
4046 callback */
4047 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4048 if (!inputobj)
4049 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004050 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004051 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004052 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004053 *input = PyBytes_AS_STRING(inputobj);
4054 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004055 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004056 /* we can DECREF safely, as the exception has another reference,
4057 so the object won't go away. */
4058 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004059
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004061 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004062 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004063 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4064 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004065 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004067 writer->overallocate = 1;
4068 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4069 return
4070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004072 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004073
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004075 Py_XDECREF(restuple);
4076 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077
Benjamin Peterson29060642009-01-31 22:14:21 +00004078 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004080 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081}
4082
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004083/* --- UTF-7 Codec -------------------------------------------------------- */
4084
Antoine Pitrou244651a2009-05-04 18:56:13 +00004085/* See RFC2152 for details. We encode conservatively and decode liberally. */
4086
/* Three simple macros defining base-64.
 *
 * All of them are plain expression macros: IS_BASE64 and FROM_BASE64
 * evaluate their argument several times, so the argument must be free of
 * side effects. */

/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
 * A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62.  Anything else
 * yields 63, which is only correct for '/': callers must check IS_BASE64
 * first. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
 * Higher bits of n are masked off, so any integer value is accepted. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself. We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Values above 127 are never decoded directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4117
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The array has one entry per ASCII code (0-127) and is indexed by the
 * character value; the comment above each row names the character that
 * each column stands for.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4151
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        : character value; only 1..127 can ever be encoded directly
 *              (0 and anything >= 128 always yield false).
 *   directO  : non-zero to emit "Set O" characters (category 1) as-is.
 *   directWS : non-zero to emit whitespace (category 2) as-is.
 *
 * "Set D" characters (category 0) are always direct.  The range test comes
 * first so utf7_category is only indexed with values below 128.  The macro
 * evaluates c several times; pass an expression without side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004163
Alexander Belopolsky40018472011-02-26 01:02:56 +00004164PyObject *
4165PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004166 Py_ssize_t size,
4167 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004168{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004169 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4170}
4171
Antoine Pitrou244651a2009-05-04 18:56:13 +00004172/* The decoder. The only state we preserve is our read position,
4173 * i.e. how many characters we have consumed. So if we end in the
4174 * middle of a shift sequence we have to back off the read position
4175 * and the output to the beginning of the sequence, otherwise we lose
4176 * all the shift state (seen bits, number of bits seen, high
4177 * surrogate). */
4178
Alexander Belopolsky40018472011-02-26 01:02:56 +00004179PyObject *
4180PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004181 Py_ssize_t size,
4182 const char *errors,
4183 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004184{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004186 Py_ssize_t startinpos;
4187 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004188 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004189 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004190 const char *errmsg = "";
4191 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004192 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004193 unsigned int base64bits = 0;
4194 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004195 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 PyObject *errorHandler = NULL;
4197 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004198
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004199 if (size == 0) {
4200 if (consumed)
4201 *consumed = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004202 Py_INCREF(unicode_empty);
4203 return unicode_empty;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004204 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004205
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004206 /* Start off assuming it's all ASCII. Widen later as necessary. */
4207 _PyUnicodeWriter_Init(&writer, 0);
4208 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4209 goto onError;
4210
4211 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004212 e = s + size;
4213
4214 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004215 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004217 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004218
Antoine Pitrou244651a2009-05-04 18:56:13 +00004219 if (inShift) { /* in a base-64 section */
4220 if (IS_BASE64(ch)) { /* consume a base-64 character */
4221 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4222 base64bits += 6;
4223 s++;
4224 if (base64bits >= 16) {
4225 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004226 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004227 base64bits -= 16;
4228 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4229 if (surrogate) {
4230 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004231 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4232 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004233 if (_PyUnicodeWriter_Prepare(&writer, 1, ch2) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004234 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004235 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch2);
4236 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004237 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004238 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004239 }
4240 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004241 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004242 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004243 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4244 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004245 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004246 }
4247 }
Victor Stinner551ac952011-11-29 22:58:13 +01004248 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004249 /* first surrogate */
4250 surrogate = outCh;
4251 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004252 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004253 if (_PyUnicodeWriter_Prepare(&writer, 1, outCh) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004254 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004255 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, outCh);
4256 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004257 }
4258 }
4259 }
4260 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004261 inShift = 0;
4262 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004263 if (surrogate) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004264 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004265 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004266 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4267 writer.pos++;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004268 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004269 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004270 if (base64bits > 0) { /* left-over bits */
4271 if (base64bits >= 6) {
4272 /* We've seen at least one base-64 character */
4273 errmsg = "partial character in shift sequence";
4274 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004275 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004276 else {
4277 /* Some bits remain; they should be zero */
4278 if (base64buffer != 0) {
4279 errmsg = "non-zero padding bits in shift sequence";
4280 goto utf7Error;
4281 }
4282 }
4283 }
4284 if (ch != '-') {
4285 /* '-' is absorbed; other terminating
4286 characters are preserved */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004287 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004288 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004289 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4290 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004291 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004292 }
4293 }
4294 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004296 s++; /* consume '+' */
4297 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004298 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004299 if (_PyUnicodeWriter_Prepare(&writer, 1, '+') == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004300 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004301 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '+');
4302 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004303 }
4304 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004305 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004306 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004307 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004308 }
4309 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004310 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004311 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004312 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
4313 goto onError;
4314 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4315 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004316 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004317 else {
4318 startinpos = s-starts;
4319 s++;
4320 errmsg = "unexpected special character";
4321 goto utf7Error;
4322 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004325 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004326 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004327 errors, &errorHandler,
4328 "utf7", errmsg,
4329 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004331 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004332 }
4333
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334 /* end of string */
4335
4336 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4337 /* if we're in an inconsistent state, that's an error */
4338 if (surrogate ||
4339 (base64bits >= 6) ||
4340 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004341 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004342 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 errors, &errorHandler,
4344 "utf7", "unterminated shift sequence",
4345 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004347 goto onError;
4348 if (s < e)
4349 goto restart;
4350 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352
4353 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004354 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004355 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004356 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004357 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 }
4359 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004360 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004361 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004362 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004363
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004364 Py_XDECREF(errorHandler);
4365 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004366 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004367
Benjamin Peterson29060642009-01-31 22:14:21 +00004368 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369 Py_XDECREF(errorHandler);
4370 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004371 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004372 return NULL;
4373}
4374
4375
Alexander Belopolsky40018472011-02-26 01:02:56 +00004376PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004377_PyUnicode_EncodeUTF7(PyObject *str,
4378 int base64SetO,
4379 int base64WhiteSpace,
4380 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004381{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004382 int kind;
4383 void *data;
4384 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004385 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004386 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004387 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388 unsigned int base64bits = 0;
4389 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004390 char * out;
4391 char * start;
4392
Benjamin Petersonbac79492012-01-14 13:34:47 -05004393 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004394 return NULL;
4395 kind = PyUnicode_KIND(str);
4396 data = PyUnicode_DATA(str);
4397 len = PyUnicode_GET_LENGTH(str);
4398
4399 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004400 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004402 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004403 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004404 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004405 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004406 if (v == NULL)
4407 return NULL;
4408
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004409 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004410 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004411 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 if (inShift) {
4414 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4415 /* shifting out */
4416 if (base64bits) { /* output remaining bits */
4417 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4418 base64buffer = 0;
4419 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004420 }
4421 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004422 /* Characters not in the BASE64 set implicitly unshift the sequence
4423 so no '-' is required, except if the character is itself a '-' */
4424 if (IS_BASE64(ch) || ch == '-') {
4425 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004426 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427 *out++ = (char) ch;
4428 }
4429 else {
4430 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004431 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004432 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 else { /* not in a shift sequence */
4434 if (ch == '+') {
4435 *out++ = '+';
4436 *out++ = '-';
4437 }
4438 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4439 *out++ = (char) ch;
4440 }
4441 else {
4442 *out++ = '+';
4443 inShift = 1;
4444 goto encode_char;
4445 }
4446 }
4447 continue;
4448encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004449 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004450 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004451
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 /* code first surrogate */
4453 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004454 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 while (base64bits >= 6) {
4456 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4457 base64bits -= 6;
4458 }
4459 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004460 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 base64bits += 16;
4463 base64buffer = (base64buffer << 16) | ch;
4464 while (base64bits >= 6) {
4465 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4466 base64bits -= 6;
4467 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004468 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004469 if (base64bits)
4470 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4471 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004473 if (_PyBytes_Resize(&v, out - start) < 0)
4474 return NULL;
4475 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004476}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004477PyObject *
4478PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4479 Py_ssize_t size,
4480 int base64SetO,
4481 int base64WhiteSpace,
4482 const char *errors)
4483{
4484 PyObject *result;
4485 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4486 if (tmp == NULL)
4487 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004488 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004489 base64WhiteSpace, errors);
4490 Py_DECREF(tmp);
4491 return result;
4492}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493
Antoine Pitrou244651a2009-05-04 18:56:13 +00004494#undef IS_BASE64
4495#undef FROM_BASE64
4496#undef TO_BASE64
4497#undef DECODE_DIRECT
4498#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004499
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500/* --- UTF-8 Codec -------------------------------------------------------- */
4501
Alexander Belopolsky40018472011-02-26 01:02:56 +00004502PyObject *
4503PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004504 Py_ssize_t size,
4505 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506{
Walter Dörwald69652032004-09-07 20:24:22 +00004507 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4508}
4509
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004510#include "stringlib/asciilib.h"
4511#include "stringlib/codecs.h"
4512#include "stringlib/undef.h"
4513
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004514#include "stringlib/ucs1lib.h"
4515#include "stringlib/codecs.h"
4516#include "stringlib/undef.h"
4517
4518#include "stringlib/ucs2lib.h"
4519#include "stringlib/codecs.h"
4520#include "stringlib/undef.h"
4521
4522#include "stringlib/ucs4lib.h"
4523#include "stringlib/codecs.h"
4524#include "stringlib/undef.h"
4525
Antoine Pitrouab868312009-01-10 15:40:25 +00004526/* Mask to quickly check whether a C 'long' contains a
4527 non-ASCII, UTF8-encoded char. */
4528#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004529# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004530#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004531# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004532#else
4533# error C 'long' size should be either 4 or 8!
4534#endif
4535
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004536static Py_ssize_t
4537ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004538{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004539 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004540 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004541
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004542#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004543 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4544 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004545 /* Fast path, see in STRINGLIB(utf8_decode) for
4546 an explanation. */
4547 /* Help register allocation */
4548 register const char *_p = p;
4549 register Py_UCS1 * q = dest;
4550 while (_p < aligned_end) {
4551 unsigned long value = *(const unsigned long *) _p;
4552 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004554 *((unsigned long *)q) = value;
4555 _p += SIZEOF_LONG;
4556 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004557 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004558 p = _p;
4559 while (p < end) {
4560 if ((unsigned char)*p & 0x80)
4561 break;
4562 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004564 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004566#endif
4567 while (p < end) {
4568 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4569 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004570 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004571 /* Help register allocation */
4572 register const char *_p = p;
4573 while (_p < aligned_end) {
4574 unsigned long value = *(unsigned long *) _p;
4575 if (value & ASCII_CHAR_MASK)
4576 break;
4577 _p += SIZEOF_LONG;
4578 }
4579 p = _p;
4580 if (_p == end)
4581 break;
4582 }
4583 if ((unsigned char)*p & 0x80)
4584 break;
4585 ++p;
4586 }
4587 memcpy(dest, start, p - start);
4588 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589}
Antoine Pitrouab868312009-01-10 15:40:25 +00004590
Victor Stinner785938e2011-12-11 20:09:03 +01004591PyObject *
4592PyUnicode_DecodeUTF8Stateful(const char *s,
4593 Py_ssize_t size,
4594 const char *errors,
4595 Py_ssize_t *consumed)
4596{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004597 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004598 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004599 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004600
4601 Py_ssize_t startinpos;
4602 Py_ssize_t endinpos;
4603 const char *errmsg = "";
4604 PyObject *errorHandler = NULL;
4605 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004606
4607 if (size == 0) {
4608 if (consumed)
4609 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004610 Py_INCREF(unicode_empty);
4611 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004612 }
4613
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004614 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4615 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004616 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004617 *consumed = 1;
4618 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004619 }
4620
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004621 _PyUnicodeWriter_Init(&writer, 0);
4622 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4623 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004624
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004625 writer.pos = ascii_decode(s, end, writer.data);
4626 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004627 while (s < end) {
4628 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004629 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004630 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004631 if (PyUnicode_IS_ASCII(writer.buffer))
4632 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004633 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004634 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004635 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004636 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004637 } else {
4638 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004639 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004640 }
4641
4642 switch (ch) {
4643 case 0:
4644 if (s == end || consumed)
4645 goto End;
4646 errmsg = "unexpected end of data";
4647 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004648 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004649 break;
4650 case 1:
4651 errmsg = "invalid start byte";
4652 startinpos = s - starts;
4653 endinpos = startinpos + 1;
4654 break;
4655 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004656 case 3:
4657 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004658 errmsg = "invalid continuation byte";
4659 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004660 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004661 break;
4662 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004663 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004664 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004665 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4666 writer.pos++;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004667 continue;
4668 }
4669
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004670 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004671 errors, &errorHandler,
4672 "utf-8", errmsg,
4673 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004674 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004675 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004676 }
4677
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004678End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004679 if (consumed)
4680 *consumed = s - starts;
4681
4682 Py_XDECREF(errorHandler);
4683 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004684 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004685
4686onError:
4687 Py_XDECREF(errorHandler);
4688 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004689 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004690 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004691}
4692
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004693#ifdef __APPLE__
4694
4695/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004696 used to decode the command line arguments on Mac OS X.
4697
4698 Return a pointer to a newly allocated wide character string (use
4699 PyMem_Free() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004700
4701wchar_t*
4702_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4703{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004704 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004705 wchar_t *unicode;
4706 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004707
4708 /* Note: size will always be longer than the resulting Unicode
4709 character count */
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004710 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004711 return NULL;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004712 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4713 if (!unicode)
4714 return NULL;
4715
4716 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004717 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004718 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004719 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004721#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004723#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004724 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004725#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004726 if (ch > 0xFF) {
4727#if SIZEOF_WCHAR_T == 4
4728 assert(0);
4729#else
4730 assert(Py_UNICODE_IS_SURROGATE(ch));
4731 /* compute and append the two surrogates: */
4732 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4733 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4734#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004735 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004736 else {
4737 if (!ch && s == e)
4738 break;
4739 /* surrogateescape */
4740 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4741 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004742 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004743 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004744 return unicode;
4745}
4746
4747#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004749/* Primary internal function which creates utf8 encoded bytes objects.
4750
4751 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004752 and allocate exactly as much space needed at the end. Else allocate the
4753 maximum possible needed (4 result bytes per Unicode character), and return
4754 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004755*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004756PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004757_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758{
Victor Stinner6099a032011-12-18 14:22:26 +01004759 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004760 void *data;
4761 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004763 if (!PyUnicode_Check(unicode)) {
4764 PyErr_BadArgument();
4765 return NULL;
4766 }
4767
4768 if (PyUnicode_READY(unicode) == -1)
4769 return NULL;
4770
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004771 if (PyUnicode_UTF8(unicode))
4772 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4773 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004774
4775 kind = PyUnicode_KIND(unicode);
4776 data = PyUnicode_DATA(unicode);
4777 size = PyUnicode_GET_LENGTH(unicode);
4778
Benjamin Petersonead6b532011-12-20 17:23:42 -06004779 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004780 default:
4781 assert(0);
4782 case PyUnicode_1BYTE_KIND:
4783 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4784 assert(!PyUnicode_IS_ASCII(unicode));
4785 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4786 case PyUnicode_2BYTE_KIND:
4787 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4788 case PyUnicode_4BYTE_KIND:
4789 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791}
4792
Alexander Belopolsky40018472011-02-26 01:02:56 +00004793PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004794PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4795 Py_ssize_t size,
4796 const char *errors)
4797{
4798 PyObject *v, *unicode;
4799
4800 unicode = PyUnicode_FromUnicode(s, size);
4801 if (unicode == NULL)
4802 return NULL;
4803 v = _PyUnicode_AsUTF8String(unicode, errors);
4804 Py_DECREF(unicode);
4805 return v;
4806}
4807
4808PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004809PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004811 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812}
4813
Walter Dörwald41980ca2007-08-16 21:55:45 +00004814/* --- UTF-32 Codec ------------------------------------------------------- */
4815
4816PyObject *
4817PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 Py_ssize_t size,
4819 const char *errors,
4820 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004821{
4822 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4823}
4824
4825PyObject *
4826PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004827 Py_ssize_t size,
4828 const char *errors,
4829 int *byteorder,
4830 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004831{
4832 const char *starts = s;
4833 Py_ssize_t startinpos;
4834 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004835 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004836 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004837 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004838 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004839 PyObject *errorHandler = NULL;
4840 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004841
Walter Dörwald41980ca2007-08-16 21:55:45 +00004842 q = (unsigned char *)s;
4843 e = q + size;
4844
4845 if (byteorder)
4846 bo = *byteorder;
4847
4848 /* Check for BOM marks (U+FEFF) in the input and adjust current
4849 byte order setting accordingly. In native mode, the leading BOM
4850 mark is skipped, in all other modes, it is copied to the output
4851 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004852 if (bo == 0 && size >= 4) {
4853 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4854 if (bom == 0x0000FEFF) {
4855 bo = -1;
4856 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004857 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004858 else if (bom == 0xFFFE0000) {
4859 bo = 1;
4860 q += 4;
4861 }
4862 if (byteorder)
4863 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004864 }
4865
Victor Stinnere64322e2012-10-30 23:12:47 +01004866 if (q == e) {
4867 if (consumed)
4868 *consumed = size;
4869 Py_INCREF(unicode_empty);
4870 return unicode_empty;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004871 }
4872
Victor Stinnere64322e2012-10-30 23:12:47 +01004873#ifdef WORDS_BIGENDIAN
4874 le = bo < 0;
4875#else
4876 le = bo <= 0;
4877#endif
4878
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004879 _PyUnicodeWriter_Init(&writer, 0);
4880 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
4881 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004882
Victor Stinnere64322e2012-10-30 23:12:47 +01004883 while (1) {
4884 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004885 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004886
Victor Stinnere64322e2012-10-30 23:12:47 +01004887 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004888 enum PyUnicode_Kind kind = writer.kind;
4889 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004890 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004891 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004892 if (le) {
4893 do {
4894 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4895 if (ch > maxch)
4896 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004897 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004898 q += 4;
4899 } while (q <= last);
4900 }
4901 else {
4902 do {
4903 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4904 if (ch > maxch)
4905 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004906 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004907 q += 4;
4908 } while (q <= last);
4909 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004910 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004911 }
4912
4913 if (ch <= maxch) {
4914 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004915 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01004916 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00004917 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01004918 startinpos = ((const char *)q) - starts;
4919 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00004920 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004921 else {
4922 if (ch < 0x110000) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004923 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinnere64322e2012-10-30 23:12:47 +01004924 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004925 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4926 writer.pos++;
Victor Stinnere64322e2012-10-30 23:12:47 +01004927 q += 4;
4928 continue;
4929 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004930 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01004931 startinpos = ((const char *)q) - starts;
4932 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004933 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004934
4935 /* The remaining input chars are ignored if the callback
4936 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004937 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004938 errors, &errorHandler,
4939 "utf32", errmsg,
4940 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004941 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004942 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004943 }
4944
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948 Py_XDECREF(errorHandler);
4949 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004950 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004951
Benjamin Peterson29060642009-01-31 22:14:21 +00004952 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004953 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004954 Py_XDECREF(errorHandler);
4955 Py_XDECREF(exc);
4956 return NULL;
4957}
4958
4959PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004960_PyUnicode_EncodeUTF32(PyObject *str,
4961 const char *errors,
4962 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004963{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004964 int kind;
4965 void *data;
4966 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004967 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004968 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004969 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004970 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02004971#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00004972 int iorder[] = {0, 1, 2, 3};
4973#else
4974 int iorder[] = {3, 2, 1, 0};
4975#endif
4976
Benjamin Peterson29060642009-01-31 22:14:21 +00004977#define STORECHAR(CH) \
4978 do { \
4979 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4980 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4981 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4982 p[iorder[0]] = (CH) & 0xff; \
4983 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 } while(0)
4985
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004986 if (!PyUnicode_Check(str)) {
4987 PyErr_BadArgument();
4988 return NULL;
4989 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004990 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004991 return NULL;
4992 kind = PyUnicode_KIND(str);
4993 data = PyUnicode_DATA(str);
4994 len = PyUnicode_GET_LENGTH(str);
4995
4996 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004997 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004999 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005000 if (v == NULL)
5001 return NULL;
5002
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005003 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005004 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005006 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005007 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008
5009 if (byteorder == -1) {
5010 /* force LE */
5011 iorder[0] = 0;
5012 iorder[1] = 1;
5013 iorder[2] = 2;
5014 iorder[3] = 3;
5015 }
5016 else if (byteorder == 1) {
5017 /* force BE */
5018 iorder[0] = 3;
5019 iorder[1] = 2;
5020 iorder[2] = 1;
5021 iorder[3] = 0;
5022 }
5023
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005024 for (i = 0; i < len; i++)
5025 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005026
5027 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005028 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005029#undef STORECHAR
5030}
5031
Alexander Belopolsky40018472011-02-26 01:02:56 +00005032PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005033PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5034 Py_ssize_t size,
5035 const char *errors,
5036 int byteorder)
5037{
5038 PyObject *result;
5039 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5040 if (tmp == NULL)
5041 return NULL;
5042 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5043 Py_DECREF(tmp);
5044 return result;
5045}
5046
5047PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005048PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005049{
Victor Stinnerb960b342011-11-20 19:12:52 +01005050 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051}
5052
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053/* --- UTF-16 Codec ------------------------------------------------------- */
5054
Tim Peters772747b2001-08-09 22:21:55 +00005055PyObject *
5056PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 Py_ssize_t size,
5058 const char *errors,
5059 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060{
Walter Dörwald69652032004-09-07 20:24:22 +00005061 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5062}
5063
5064PyObject *
5065PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 Py_ssize_t size,
5067 const char *errors,
5068 int *byteorder,
5069 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005070{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005071 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005072 Py_ssize_t startinpos;
5073 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005074 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005075 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005076 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005077 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005078 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079 PyObject *errorHandler = NULL;
5080 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081
Tim Peters772747b2001-08-09 22:21:55 +00005082 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005083 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084
5085 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005086 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005088 /* Check for BOM marks (U+FEFF) in the input and adjust current
5089 byte order setting accordingly. In native mode, the leading BOM
5090 mark is skipped, in all other modes, it is copied to the output
5091 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005092 if (bo == 0 && size >= 2) {
5093 const Py_UCS4 bom = (q[1] << 8) | q[0];
5094 if (bom == 0xFEFF) {
5095 q += 2;
5096 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005098 else if (bom == 0xFFFE) {
5099 q += 2;
5100 bo = 1;
5101 }
5102 if (byteorder)
5103 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005104 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105
Antoine Pitrou63065d72012-05-15 23:48:04 +02005106 if (q == e) {
5107 if (consumed)
5108 *consumed = size;
5109 Py_INCREF(unicode_empty);
5110 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005111 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005112
Christian Heimes743e0cd2012-10-17 23:52:17 +02005113#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005114 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005115#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005116 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005117#endif
Tim Peters772747b2001-08-09 22:21:55 +00005118
Antoine Pitrou63065d72012-05-15 23:48:04 +02005119 /* Note: size will always be longer than the resulting Unicode
5120 character count */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005121 _PyUnicodeWriter_Init(&writer, 0);
5122 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
5123 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005124
Antoine Pitrou63065d72012-05-15 23:48:04 +02005125 while (1) {
5126 Py_UCS4 ch = 0;
5127 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005128 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005129 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005130 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005131 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005132 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005133 native_ordering);
5134 else
5135 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005136 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005137 native_ordering);
5138 } else if (kind == PyUnicode_2BYTE_KIND) {
5139 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005140 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005141 native_ordering);
5142 } else {
5143 assert(kind == PyUnicode_4BYTE_KIND);
5144 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005145 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005146 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005147 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005148 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149
Antoine Pitrou63065d72012-05-15 23:48:04 +02005150 switch (ch)
5151 {
5152 case 0:
5153 /* remaining byte at the end? (size should be even) */
5154 if (q == e || consumed)
5155 goto End;
5156 errmsg = "truncated data";
5157 startinpos = ((const char *)q) - starts;
5158 endinpos = ((const char *)e) - starts;
5159 break;
5160 /* The remaining input chars are ignored if the callback
5161 chooses to skip the input */
5162 case 1:
5163 errmsg = "unexpected end of data";
5164 startinpos = ((const char *)q) - 2 - starts;
5165 endinpos = ((const char *)e) - starts;
5166 break;
5167 case 2:
5168 errmsg = "illegal encoding";
5169 startinpos = ((const char *)q) - 2 - starts;
5170 endinpos = startinpos + 2;
5171 break;
5172 case 3:
5173 errmsg = "illegal UTF-16 surrogate";
5174 startinpos = ((const char *)q) - 4 - starts;
5175 endinpos = startinpos + 2;
5176 break;
5177 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005178 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005179 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005180 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
5181 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 continue;
5183 }
5184
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005185 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005186 errors,
5187 &errorHandler,
5188 "utf16", errmsg,
5189 &starts,
5190 (const char **)&e,
5191 &startinpos,
5192 &endinpos,
5193 &exc,
5194 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005195 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 }
5198
Antoine Pitrou63065d72012-05-15 23:48:04 +02005199End:
Walter Dörwald69652032004-09-07 20:24:22 +00005200 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005201 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005202
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005203 Py_XDECREF(errorHandler);
5204 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005205 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005208 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005209 Py_XDECREF(errorHandler);
5210 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 return NULL;
5212}
5213
Tim Peters772747b2001-08-09 22:21:55 +00005214PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005215_PyUnicode_EncodeUTF16(PyObject *str,
5216 const char *errors,
5217 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005219 enum PyUnicode_Kind kind;
5220 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005221 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005222 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005223 unsigned short *out;
5224 Py_ssize_t bytesize;
5225 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005226#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005227 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005228#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005229 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005230#endif
5231
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005232 if (!PyUnicode_Check(str)) {
5233 PyErr_BadArgument();
5234 return NULL;
5235 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005236 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005237 return NULL;
5238 kind = PyUnicode_KIND(str);
5239 data = PyUnicode_DATA(str);
5240 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005241
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005242 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005243 if (kind == PyUnicode_4BYTE_KIND) {
5244 const Py_UCS4 *in = (const Py_UCS4 *)data;
5245 const Py_UCS4 *end = in + len;
5246 while (in < end)
5247 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005248 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005249 }
5250 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005252 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005253 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 if (v == NULL)
5255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005257 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005258 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005259 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005261 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005262 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005263 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005264
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005265 switch (kind) {
5266 case PyUnicode_1BYTE_KIND: {
5267 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5268 break;
Tim Peters772747b2001-08-09 22:21:55 +00005269 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005270 case PyUnicode_2BYTE_KIND: {
5271 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5272 break;
Tim Peters772747b2001-08-09 22:21:55 +00005273 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005274 case PyUnicode_4BYTE_KIND: {
5275 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5276 break;
5277 }
5278 default:
5279 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005280 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005281
5282 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005283 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284}
5285
Alexander Belopolsky40018472011-02-26 01:02:56 +00005286PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005287PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5288 Py_ssize_t size,
5289 const char *errors,
5290 int byteorder)
5291{
5292 PyObject *result;
5293 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5294 if (tmp == NULL)
5295 return NULL;
5296 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5297 Py_DECREF(tmp);
5298 return result;
5299}
5300
5301PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005302PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005304 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305}
5306
/* --- Unicode Escape Codec ----------------------------------------------- */
5308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005309/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5310 if all the escapes in the string make it still a valid ASCII string.
5311 Returns -1 if any escapes were found which cause the string to
5312 pop out of ASCII range. Otherwise returns the length of the
5313 required buffer to hold the string.
5314 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005315static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005316length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5317{
5318 const unsigned char *p = (const unsigned char *)s;
5319 const unsigned char *end = p + size;
5320 Py_ssize_t length = 0;
5321
5322 if (size < 0)
5323 return -1;
5324
5325 for (; p < end; ++p) {
5326 if (*p > 127) {
5327 /* Non-ASCII */
5328 return -1;
5329 }
5330 else if (*p != '\\') {
5331 /* Normal character */
5332 ++length;
5333 }
5334 else {
5335 /* Backslash-escape, check next char */
5336 ++p;
5337 /* Escape sequence reaches till end of string or
5338 non-ASCII follow-up. */
5339 if (p >= end || *p > 127)
5340 return -1;
5341 switch (*p) {
5342 case '\n':
5343 /* backslash + \n result in zero characters */
5344 break;
5345 case '\\': case '\'': case '\"':
5346 case 'b': case 'f': case 't':
5347 case 'n': case 'r': case 'v': case 'a':
5348 ++length;
5349 break;
5350 case '0': case '1': case '2': case '3':
5351 case '4': case '5': case '6': case '7':
5352 case 'x': case 'u': case 'U': case 'N':
5353 /* these do not guarantee ASCII characters */
5354 return -1;
5355 default:
5356 /* count the backslash + the other character */
5357 length += 2;
5358 }
5359 }
5360 }
5361 return length;
5362}
5363
Fredrik Lundh06d12682001-01-24 07:59:11 +00005364static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005365
Alexander Belopolsky40018472011-02-26 01:02:56 +00005366PyObject *
5367PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005368 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005369 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005371 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005372 Py_ssize_t startinpos;
5373 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005374 int j;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005375 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005377 char* message;
5378 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005379 PyObject *errorHandler = NULL;
5380 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005381 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005382
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005383 len = length_of_escaped_ascii_string(s, size);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005384 if (len == 0) {
5385 Py_INCREF(unicode_empty);
5386 return unicode_empty;
5387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005388
5389 /* After length_of_escaped_ascii_string() there are two alternatives,
5390 either the string is pure ASCII with named escapes like \n, etc.
5391 and we determined it's exact size (common case)
5392 or it contains \x, \u, ... escape sequences. then we create a
5393 legacy wchar string and resize it at the end of this function. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005394 _PyUnicodeWriter_Init(&writer, 0);
5395 if (len > 0) {
5396 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005397 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005398 assert(writer.kind == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005399 }
5400 else {
5401 /* Escaped strings will always be longer than the resulting
5402 Unicode string, so we start with size here and then reduce the
5403 length after conversion to the true value.
5404 (but if the error callback returns a long replacement string
5405 we'll have to allocate more space) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005406 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005407 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005408 }
5409
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005411 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005413
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 while (s < end) {
5415 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005416 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005417 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418
5419 /* Non-escape characters are interpreted as Unicode ordinals */
5420 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005421 x = (unsigned char)*s;
5422 s++;
5423 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005424 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005425 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5426 writer.pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 continue;
5428 }
5429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005430 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 /* \ - Escapes */
5432 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005433 c = *s++;
5434 if (s > end)
5435 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005436
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005437 /* The only case in which i == ascii_length is a backslash
5438 followed by a newline. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005439 assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005440
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005441 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442
Benjamin Peterson29060642009-01-31 22:14:21 +00005443 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005444#define WRITECHAR(ch) \
5445 do { \
5446 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) \
5447 goto onError; \
5448 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); \
5449 writer.pos++; \
5450 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005451
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005453 case '\\': WRITECHAR('\\'); break;
5454 case '\'': WRITECHAR('\''); break;
5455 case '\"': WRITECHAR('\"'); break;
5456 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005457 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005458 case 'f': WRITECHAR('\014'); break;
5459 case 't': WRITECHAR('\t'); break;
5460 case 'n': WRITECHAR('\n'); break;
5461 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005462 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005463 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005464 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005465 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 case '0': case '1': case '2': case '3':
5469 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005470 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005471 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005472 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005473 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005474 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005476 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 break;
5478
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 /* hex escapes */
5480 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005482 digits = 2;
5483 message = "truncated \\xXX escape";
5484 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005488 digits = 4;
5489 message = "truncated \\uXXXX escape";
5490 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005493 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005494 digits = 8;
5495 message = "truncated \\UXXXXXXXX escape";
5496 hexescape:
5497 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005498 if (s+digits>end) {
5499 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005500 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 errors, &errorHandler,
5502 "unicodeescape", "end of string in escape sequence",
5503 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005504 &writer))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 goto onError;
5506 goto nextByte;
5507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005508 for (j = 0; j < digits; ++j) {
5509 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005510 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005511 endinpos = (s+j+1)-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005512 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 errors, &errorHandler,
5514 "unicodeescape", message,
5515 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005516 &writer))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005517 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005518 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005519 }
5520 chr = (chr<<4) & ~0xF;
5521 if (c >= '0' && c <= '9')
5522 chr += c - '0';
5523 else if (c >= 'a' && c <= 'f')
5524 chr += 10 + c - 'a';
5525 else
5526 chr += 10 + c - 'A';
5527 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005528 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005529 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005530 /* _decoding_error will have already written into the
5531 target buffer. */
5532 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005533 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005534 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005535 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005536 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005537 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005538 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005539 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 errors, &errorHandler,
5541 "unicodeescape", "illegal Unicode character",
5542 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005543 &writer))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005544 goto onError;
5545 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005546 break;
5547
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005549 case 'N':
5550 message = "malformed \\N character escape";
5551 if (ucnhash_CAPI == NULL) {
5552 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005553 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5554 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005555 if (ucnhash_CAPI == NULL)
5556 goto ucnhashError;
5557 }
5558 if (*s == '{') {
5559 const char *start = s+1;
5560 /* look for the closing brace */
5561 while (*s != '}' && s < end)
5562 s++;
5563 if (s > start && s < end && *s == '}') {
5564 /* found a name. look it up in the unicode database */
5565 message = "unknown Unicode character name";
5566 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005567 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005568 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005569 goto store;
5570 }
5571 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005573 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 errors, &errorHandler,
5575 "unicodeescape", message,
5576 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005577 &writer))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005578 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005579 break;
5580
5581 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005582 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005583 message = "\\ at end of string";
5584 s--;
5585 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005586 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 errors, &errorHandler,
5588 "unicodeescape", message,
5589 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005590 &writer))
Walter Dörwald8c077222002-03-25 11:16:18 +00005591 goto onError;
5592 }
5593 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005594 WRITECHAR('\\');
5595 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005596 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005597 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005602#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005603
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005604 Py_XDECREF(errorHandler);
5605 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005606 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005607
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005609 PyErr_SetString(
5610 PyExc_UnicodeError,
5611 "\\N escapes not supported (can't load unicodedata module)"
5612 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005613 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005614 Py_XDECREF(errorHandler);
5615 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005616 return NULL;
5617
Benjamin Peterson29060642009-01-31 22:14:21 +00005618 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005619 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620 Py_XDECREF(errorHandler);
5621 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 return NULL;
5623}
5624
5625/* Return a Unicode-Escape string version of the Unicode object.
5626
5627 If quotes is true, the string is enclosed in u"" or u'' quotes as
5628 appropriate.
5629
5630*/
5631
Alexander Belopolsky40018472011-02-26 01:02:56 +00005632PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005633PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005635 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005636 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005638 int kind;
5639 void *data;
5640 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641
Ezio Melottie7f90372012-10-05 03:33:31 +03005642 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005643 escape.
5644
Ezio Melottie7f90372012-10-05 03:33:31 +03005645 For UCS1 strings it's '\xxx', 4 bytes per source character.
5646 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5647 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005648 */
5649
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005650 if (!PyUnicode_Check(unicode)) {
5651 PyErr_BadArgument();
5652 return NULL;
5653 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005654 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005655 return NULL;
5656 len = PyUnicode_GET_LENGTH(unicode);
5657 kind = PyUnicode_KIND(unicode);
5658 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005659 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005660 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5661 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5662 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5663 }
5664
5665 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005666 return PyBytes_FromStringAndSize(NULL, 0);
5667
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005668 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005670
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005671 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005673 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 if (repr == NULL)
5676 return NULL;
5677
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005678 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005680 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005681 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005682
Walter Dörwald79e913e2007-05-12 11:08:06 +00005683 /* Escape backslashes */
5684 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 *p++ = '\\';
5686 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005687 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005688 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005689
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005690 /* Map 21-bit characters to '\U00xxxxxx' */
5691 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005692 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005693 *p++ = '\\';
5694 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005695 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5696 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5697 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5698 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5699 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5700 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5701 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5702 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005704 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005705
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005707 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 *p++ = '\\';
5709 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005710 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5711 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5712 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5713 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005715
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005716 /* Map special whitespace to '\t', \n', '\r' */
5717 else if (ch == '\t') {
5718 *p++ = '\\';
5719 *p++ = 't';
5720 }
5721 else if (ch == '\n') {
5722 *p++ = '\\';
5723 *p++ = 'n';
5724 }
5725 else if (ch == '\r') {
5726 *p++ = '\\';
5727 *p++ = 'r';
5728 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005729
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005730 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005731 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005733 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005734 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5735 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005736 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005737
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 /* Copy everything else as-is */
5739 else
5740 *p++ = (char) ch;
5741 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005743 assert(p - PyBytes_AS_STRING(repr) > 0);
5744 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5745 return NULL;
5746 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747}
5748
Alexander Belopolsky40018472011-02-26 01:02:56 +00005749PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005750PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5751 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005753 PyObject *result;
5754 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5755 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005757 result = PyUnicode_AsUnicodeEscapeString(tmp);
5758 Py_DECREF(tmp);
5759 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760}
5761
5762/* --- Raw Unicode Escape Codec ------------------------------------------- */
5763
Alexander Belopolsky40018472011-02-26 01:02:56 +00005764PyObject *
5765PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005766 Py_ssize_t size,
5767 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005770 Py_ssize_t startinpos;
5771 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005772 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 const char *end;
5774 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 PyObject *errorHandler = NULL;
5776 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005777
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005778 if (size == 0) {
5779 Py_INCREF(unicode_empty);
5780 return unicode_empty;
5781 }
5782
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 /* Escaped strings will always be longer than the resulting
5784 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005785 length after conversion to the true value. (But decoding error
5786 handler might have to resize the string) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005787 _PyUnicodeWriter_Init(&writer, 1);
5788 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005790
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791 end = s + size;
5792 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 unsigned char c;
5794 Py_UCS4 x;
5795 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005796 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 /* Non-escape characters are interpreted as Unicode ordinals */
5799 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005800 x = (unsigned char)*s++;
5801 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005802 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005803 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5804 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005806 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 startinpos = s-starts;
5808
5809 /* \u-escapes are only interpreted iff the number of leading
5810 backslashes if odd */
5811 bs = s;
5812 for (;s < end;) {
5813 if (*s != '\\')
5814 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005815 x = (unsigned char)*s++;
5816 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005817 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005818 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5819 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 }
5821 if (((s - bs) & 1) == 0 ||
5822 s >= end ||
5823 (*s != 'u' && *s != 'U')) {
5824 continue;
5825 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005826 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005827 count = *s=='u' ? 4 : 8;
5828 s++;
5829
5830 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 for (x = 0, i = 0; i < count; ++i, ++s) {
5832 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005833 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005835 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 errors, &errorHandler,
5837 "rawunicodeescape", "truncated \\uXXXX",
5838 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005839 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 goto onError;
5841 goto nextByte;
5842 }
5843 x = (x<<4) & ~0xF;
5844 if (c >= '0' && c <= '9')
5845 x += c - '0';
5846 else if (c >= 'a' && c <= 'f')
5847 x += 10 + c - 'a';
5848 else
5849 x += 10 + c - 'A';
5850 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005851 if (x <= MAX_UNICODE) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005852 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005853 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005854 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5855 writer.pos++;
5856 }
5857 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005858 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005859 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005860 errors, &errorHandler,
5861 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005862 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005863 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005864 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005865 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 nextByte:
5867 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005869 Py_XDECREF(errorHandler);
5870 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005871 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005872
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005874 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005875 Py_XDECREF(errorHandler);
5876 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 return NULL;
5878}
5879
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005880
Alexander Belopolsky40018472011-02-26 01:02:56 +00005881PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005884 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885 char *p;
5886 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005887 Py_ssize_t expandsize, pos;
5888 int kind;
5889 void *data;
5890 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 if (!PyUnicode_Check(unicode)) {
5893 PyErr_BadArgument();
5894 return NULL;
5895 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005896 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005897 return NULL;
5898 kind = PyUnicode_KIND(unicode);
5899 data = PyUnicode_DATA(unicode);
5900 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005901 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5902 bytes, and 1 byte characters 4. */
5903 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005904
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005905 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005907
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005908 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 if (repr == NULL)
5910 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005911 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005912 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005914 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005915 for (pos = 0; pos < len; pos++) {
5916 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 /* Map 32-bit characters to '\Uxxxxxxxx' */
5918 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005919 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005920 *p++ = '\\';
5921 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005922 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5923 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5924 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5925 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5926 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5927 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5928 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5929 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005930 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005932 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 *p++ = '\\';
5934 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005935 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5936 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5937 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5938 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 /* Copy everything else as-is */
5941 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 *p++ = (char) ch;
5943 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005944
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005945 assert(p > q);
5946 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005947 return NULL;
5948 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949}
5950
Alexander Belopolsky40018472011-02-26 01:02:56 +00005951PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005952PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5953 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005955 PyObject *result;
5956 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5957 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005958 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005959 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5960 Py_DECREF(tmp);
5961 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962}
5963
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005964/* --- Unicode Internal Codec ------------------------------------------- */
5965
Alexander Belopolsky40018472011-02-26 01:02:56 +00005966PyObject *
5967_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005968 Py_ssize_t size,
5969 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005970{
5971 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005972 Py_ssize_t startinpos;
5973 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005974 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005975 const char *end;
5976 const char *reason;
5977 PyObject *errorHandler = NULL;
5978 PyObject *exc = NULL;
5979
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005980 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005981 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005982 1))
5983 return NULL;
5984
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005985 if (size == 0) {
5986 Py_INCREF(unicode_empty);
5987 return unicode_empty;
5988 }
5989
Thomas Wouters89f507f2006-12-13 04:49:30 +00005990 /* XXX overflow detection missing */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005991 _PyUnicodeWriter_Init(&writer, 0);
5992 if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005994 end = s + size;
5995
5996 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005997 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005998 Py_UCS4 ch;
5999 /* We copy the raw representation one byte at a time because the
6000 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006001 ((char *) &uch)[0] = s[0];
6002 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006003#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006004 ((char *) &uch)[2] = s[2];
6005 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006006#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006007 ch = uch;
6008
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006009 /* We have to sanity check the raw data, otherwise doom looms for
6010 some malformed UCS-4 data. */
6011 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006012#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006013 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006014#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006015 end-s < Py_UNICODE_SIZE
6016 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006018 startinpos = s - starts;
6019 if (end-s < Py_UNICODE_SIZE) {
6020 endinpos = end-starts;
6021 reason = "truncated input";
6022 }
6023 else {
6024 endinpos = s - starts + Py_UNICODE_SIZE;
6025 reason = "illegal code point (> 0x10FFFF)";
6026 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006027 if (unicode_decode_call_errorhandler_writer(
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006028 errors, &errorHandler,
6029 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006030 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006031 &writer))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006032 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006033 continue;
6034 }
6035
6036 s += Py_UNICODE_SIZE;
6037#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006038 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006039 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006040 Py_UNICODE uch2;
6041 ((char *) &uch2)[0] = s[0];
6042 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006043 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006044 {
Victor Stinner551ac952011-11-29 22:58:13 +01006045 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006046 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006047 }
6048 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006049#endif
6050
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006051 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006052 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006053 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
6054 writer.pos++;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006055 }
6056
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006057 Py_XDECREF(errorHandler);
6058 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006059 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006060
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006062 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006063 Py_XDECREF(errorHandler);
6064 Py_XDECREF(exc);
6065 return NULL;
6066}
6067
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068/* --- Latin-1 Codec ------------------------------------------------------ */
6069
Alexander Belopolsky40018472011-02-26 01:02:56 +00006070PyObject *
6071PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006072 Py_ssize_t size,
6073 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006076 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077}
6078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006080static void
6081make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006082 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006083 PyObject *unicode,
6084 Py_ssize_t startpos, Py_ssize_t endpos,
6085 const char *reason)
6086{
6087 if (*exceptionObject == NULL) {
6088 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006089 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006090 encoding, unicode, startpos, endpos, reason);
6091 }
6092 else {
6093 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6094 goto onError;
6095 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6096 goto onError;
6097 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6098 goto onError;
6099 return;
6100 onError:
6101 Py_DECREF(*exceptionObject);
6102 *exceptionObject = NULL;
6103 }
6104}
6105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006107static void
6108raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006109 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006110 PyObject *unicode,
6111 Py_ssize_t startpos, Py_ssize_t endpos,
6112 const char *reason)
6113{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006114 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006115 encoding, unicode, startpos, endpos, reason);
6116 if (*exceptionObject != NULL)
6117 PyCodec_StrictErrors(*exceptionObject);
6118}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119
6120/* error handling callback helper:
6121 build arguments, call the callback and check the arguments,
6122 put the result into newpos and return the replacement string, which
6123 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006124static PyObject *
6125unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006126 PyObject **errorHandler,
6127 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006128 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006129 Py_ssize_t startpos, Py_ssize_t endpos,
6130 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006132 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006133 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 PyObject *restuple;
6135 PyObject *resunicode;
6136
6137 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 }
6142
Benjamin Petersonbac79492012-01-14 13:34:47 -05006143 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006144 return NULL;
6145 len = PyUnicode_GET_LENGTH(unicode);
6146
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006147 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006148 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006149 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006150 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006151
6152 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006156 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006157 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 Py_DECREF(restuple);
6159 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006160 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006161 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 &resunicode, newpos)) {
6163 Py_DECREF(restuple);
6164 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006165 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006166 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6167 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6168 Py_DECREF(restuple);
6169 return NULL;
6170 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006172 *newpos = len + *newpos;
6173 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6175 Py_DECREF(restuple);
6176 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006177 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006178 Py_INCREF(resunicode);
6179 Py_DECREF(restuple);
6180 return resunicode;
6181}
6182
Alexander Belopolsky40018472011-02-26 01:02:56 +00006183static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006184unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006185 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006186 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006187{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006188 /* input state */
6189 Py_ssize_t pos=0, size;
6190 int kind;
6191 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192 /* output object */
6193 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006194 /* pointer into the output */
6195 char *str;
6196 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006197 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006198 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6199 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006200 PyObject *errorHandler = NULL;
6201 PyObject *exc = NULL;
6202 /* the following variable is used for caching string comparisons
6203 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6204 int known_errorHandler = -1;
6205
Benjamin Petersonbac79492012-01-14 13:34:47 -05006206 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006207 return NULL;
6208 size = PyUnicode_GET_LENGTH(unicode);
6209 kind = PyUnicode_KIND(unicode);
6210 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006211 /* allocate enough for a simple encoding without
6212 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006213 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006214 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006215 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006216 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006217 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006218 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006219 ressize = size;
6220
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006221 while (pos < size) {
6222 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006223
Benjamin Peterson29060642009-01-31 22:14:21 +00006224 /* can we encode this? */
6225 if (c<limit) {
6226 /* no overflow check, because we know that the space is enough */
6227 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006228 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006229 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 Py_ssize_t requiredsize;
6232 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006233 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006235 Py_ssize_t collstart = pos;
6236 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006238 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 ++collend;
6240 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6241 if (known_errorHandler==-1) {
6242 if ((errors==NULL) || (!strcmp(errors, "strict")))
6243 known_errorHandler = 1;
6244 else if (!strcmp(errors, "replace"))
6245 known_errorHandler = 2;
6246 else if (!strcmp(errors, "ignore"))
6247 known_errorHandler = 3;
6248 else if (!strcmp(errors, "xmlcharrefreplace"))
6249 known_errorHandler = 4;
6250 else
6251 known_errorHandler = 0;
6252 }
6253 switch (known_errorHandler) {
6254 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006255 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 goto onError;
6257 case 2: /* replace */
6258 while (collstart++<collend)
6259 *str++ = '?'; /* fall through */
6260 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006261 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 break;
6263 case 4: /* xmlcharrefreplace */
6264 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006265 /* determine replacement size */
6266 for (i = collstart, repsize = 0; i < collend; ++i) {
6267 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6268 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006270 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006272 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006274 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006276 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006278 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006280 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006281 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006283 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006285 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 if (requiredsize > ressize) {
6287 if (requiredsize<2*ressize)
6288 requiredsize = 2*ressize;
6289 if (_PyBytes_Resize(&res, requiredsize))
6290 goto onError;
6291 str = PyBytes_AS_STRING(res) + respos;
6292 ressize = requiredsize;
6293 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006294 /* generate replacement */
6295 for (i = collstart; i < collend; ++i) {
6296 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006298 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 break;
6300 default:
6301 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006302 encoding, reason, unicode, &exc,
6303 collstart, collend, &newpos);
6304 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006305 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006307 if (PyBytes_Check(repunicode)) {
6308 /* Directly copy bytes result to output. */
6309 repsize = PyBytes_Size(repunicode);
6310 if (repsize > 1) {
6311 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006312 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006313 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6314 Py_DECREF(repunicode);
6315 goto onError;
6316 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006317 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006318 ressize += repsize-1;
6319 }
6320 memcpy(str, PyBytes_AsString(repunicode), repsize);
6321 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006322 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006323 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006324 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006325 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 /* need more space? (at least enough for what we
6327 have+the replacement+the rest of the string, so
6328 we won't have to check space for encodable characters) */
6329 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006330 repsize = PyUnicode_GET_LENGTH(repunicode);
6331 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 if (requiredsize > ressize) {
6333 if (requiredsize<2*ressize)
6334 requiredsize = 2*ressize;
6335 if (_PyBytes_Resize(&res, requiredsize)) {
6336 Py_DECREF(repunicode);
6337 goto onError;
6338 }
6339 str = PyBytes_AS_STRING(res) + respos;
6340 ressize = requiredsize;
6341 }
6342 /* check if there is anything unencodable in the replacement
6343 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006344 for (i = 0; repsize-->0; ++i, ++str) {
6345 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006347 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006348 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 Py_DECREF(repunicode);
6350 goto onError;
6351 }
6352 *str = (char)c;
6353 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006354 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006355 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006356 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006357 }
6358 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006359 /* Resize if we allocated to much */
6360 size = str - PyBytes_AS_STRING(res);
6361 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006362 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006363 if (_PyBytes_Resize(&res, size) < 0)
6364 goto onError;
6365 }
6366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367 Py_XDECREF(errorHandler);
6368 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006369 return res;
6370
6371 onError:
6372 Py_XDECREF(res);
6373 Py_XDECREF(errorHandler);
6374 Py_XDECREF(exc);
6375 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006376}
6377
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006378/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006379PyObject *
6380PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006381 Py_ssize_t size,
6382 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006384 PyObject *result;
6385 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6386 if (unicode == NULL)
6387 return NULL;
6388 result = unicode_encode_ucs1(unicode, errors, 256);
6389 Py_DECREF(unicode);
6390 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391}
6392
Alexander Belopolsky40018472011-02-26 01:02:56 +00006393PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006394_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395{
6396 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 PyErr_BadArgument();
6398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006400 if (PyUnicode_READY(unicode) == -1)
6401 return NULL;
6402 /* Fast path: if it is a one-byte string, construct
6403 bytes object directly. */
6404 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6405 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6406 PyUnicode_GET_LENGTH(unicode));
6407 /* Non-Latin-1 characters present. Defer to above function to
6408 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006410}
6411
6412PyObject*
6413PyUnicode_AsLatin1String(PyObject *unicode)
6414{
6415 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416}
6417
6418/* --- 7-bit ASCII Codec -------------------------------------------------- */
6419
Alexander Belopolsky40018472011-02-26 01:02:56 +00006420PyObject *
6421PyUnicode_DecodeASCII(const char *s,
6422 Py_ssize_t size,
6423 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006426 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006427 int kind;
6428 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006429 Py_ssize_t startinpos;
6430 Py_ssize_t endinpos;
6431 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 const char *e;
6433 PyObject *errorHandler = NULL;
6434 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006435
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006436 if (size == 0) {
6437 Py_INCREF(unicode_empty);
6438 return unicode_empty;
6439 }
6440
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006442 if (size == 1 && (unsigned char)s[0] < 128)
6443 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006444
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006445 _PyUnicodeWriter_Init(&writer, 0);
6446 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006450 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006451 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006452 writer.pos = outpos;
6453 if (writer.pos == size)
6454 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006455
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006456 s += writer.pos;
6457 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006458 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006459 register unsigned char c = (unsigned char)*s;
6460 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006461 PyUnicode_WRITE(kind, data, writer.pos, c);
6462 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 ++s;
6464 }
6465 else {
6466 startinpos = s-starts;
6467 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006468 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 errors, &errorHandler,
6470 "ascii", "ordinal not in range(128)",
6471 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006472 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006474 kind = writer.kind;
6475 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478 Py_XDECREF(errorHandler);
6479 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006480 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006481
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006483 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006484 Py_XDECREF(errorHandler);
6485 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 return NULL;
6487}
6488
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006489/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006490PyObject *
6491PyUnicode_EncodeASCII(const Py_UNICODE *p,
6492 Py_ssize_t size,
6493 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006495 PyObject *result;
6496 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6497 if (unicode == NULL)
6498 return NULL;
6499 result = unicode_encode_ucs1(unicode, errors, 128);
6500 Py_DECREF(unicode);
6501 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502}
6503
Alexander Belopolsky40018472011-02-26 01:02:56 +00006504PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006505_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506{
6507 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 PyErr_BadArgument();
6509 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006511 if (PyUnicode_READY(unicode) == -1)
6512 return NULL;
6513 /* Fast path: if it is an ASCII-only string, construct bytes object
6514 directly. Else defer to above function to raise the exception. */
6515 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6516 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6517 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006519}
6520
6521PyObject *
6522PyUnicode_AsASCIIString(PyObject *unicode)
6523{
6524 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525}
6526
Victor Stinner99b95382011-07-04 14:23:54 +02006527#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006528
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006529/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006530
/* NEED_RETRY is defined when int is narrower than size_t.  The stateful
   code page decoder below tests it to cap each chunk at INT_MAX bytes,
   because the per-chunk helpers take their lengths as int. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback definition for builds whose headers do not provide
   WC_ERR_INVALID_CHARS (presumably older Windows SDK headers -- TODO
   confirm). */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6538
6539static char*
6540code_page_name(UINT code_page, PyObject **obj)
6541{
6542 *obj = NULL;
6543 if (code_page == CP_ACP)
6544 return "mbcs";
6545 if (code_page == CP_UTF7)
6546 return "CP_UTF7";
6547 if (code_page == CP_UTF8)
6548 return "CP_UTF8";
6549
6550 *obj = PyBytes_FromFormat("cp%u", code_page);
6551 if (*obj == NULL)
6552 return NULL;
6553 return PyBytes_AS_STRING(*obj);
6554}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006555
Alexander Belopolsky40018472011-02-26 01:02:56 +00006556static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006557is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006558{
6559 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006560 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006561
Victor Stinner3a50e702011-10-18 21:21:00 +02006562 if (!IsDBCSLeadByteEx(code_page, *curr))
6563 return 0;
6564
6565 prev = CharPrevExA(code_page, s, curr, 0);
6566 if (prev == curr)
6567 return 1;
6568 /* FIXME: This code is limited to "true" double-byte encodings,
6569 as it assumes an incomplete character consists of a single
6570 byte. */
6571 if (curr - prev == 2)
6572 return 1;
6573 if (!IsDBCSLeadByteEx(code_page, *prev))
6574 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006575 return 0;
6576}
6577
Victor Stinner3a50e702011-10-18 21:21:00 +02006578static DWORD
6579decode_code_page_flags(UINT code_page)
6580{
6581 if (code_page == CP_UTF7) {
6582 /* The CP_UTF7 decoder only supports flags=0 */
6583 return 0;
6584 }
6585 else
6586 return MB_ERR_INVALID_CHARS;
6587}
6588
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006589/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006590 * Decode a byte string from a Windows code page into unicode object in strict
6591 * mode.
6592 *
6593 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6594 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006595 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006596static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006597decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006598 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006599 const char *in,
6600 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006601{
Victor Stinner3a50e702011-10-18 21:21:00 +02006602 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006603 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006604 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006605
6606 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006607 assert(insize > 0);
6608 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6609 if (outsize <= 0)
6610 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006611
6612 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006614 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006615 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 if (*v == NULL)
6617 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006618 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006619 }
6620 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006622 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006623 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006625 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006626 }
6627
6628 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006629 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6630 if (outsize <= 0)
6631 goto error;
6632 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006633
Victor Stinner3a50e702011-10-18 21:21:00 +02006634error:
6635 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6636 return -2;
6637 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006638 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006639}
6640
Victor Stinner3a50e702011-10-18 21:21:00 +02006641/*
6642 * Decode a byte string from a code page into unicode object with an error
6643 * handler.
6644 *
6645 * Returns consumed size if succeed, or raise a WindowsError or
6646 * UnicodeDecodeError exception and returns -1 on error.
6647 */
6648static int
6649decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006650 PyObject **v,
6651 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006652 const char *errors)
6653{
6654 const char *startin = in;
6655 const char *endin = in + size;
6656 const DWORD flags = decode_code_page_flags(code_page);
6657 /* Ideally, we should get reason from FormatMessage. This is the Windows
6658 2000 English version of the message. */
6659 const char *reason = "No mapping for the Unicode character exists "
6660 "in the target code page.";
6661 /* each step cannot decode more than 1 character, but a character can be
6662 represented as a surrogate pair */
6663 wchar_t buffer[2], *startout, *out;
6664 int insize, outsize;
6665 PyObject *errorHandler = NULL;
6666 PyObject *exc = NULL;
6667 PyObject *encoding_obj = NULL;
6668 char *encoding;
6669 DWORD err;
6670 int ret = -1;
6671
6672 assert(size > 0);
6673
6674 encoding = code_page_name(code_page, &encoding_obj);
6675 if (encoding == NULL)
6676 return -1;
6677
6678 if (errors == NULL || strcmp(errors, "strict") == 0) {
6679 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6680 UnicodeDecodeError. */
6681 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6682 if (exc != NULL) {
6683 PyCodec_StrictErrors(exc);
6684 Py_CLEAR(exc);
6685 }
6686 goto error;
6687 }
6688
6689 if (*v == NULL) {
6690 /* Create unicode object */
6691 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6692 PyErr_NoMemory();
6693 goto error;
6694 }
Victor Stinnerab595942011-12-17 04:59:06 +01006695 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006696 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006697 if (*v == NULL)
6698 goto error;
6699 startout = PyUnicode_AS_UNICODE(*v);
6700 }
6701 else {
6702 /* Extend unicode object */
6703 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6704 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6705 PyErr_NoMemory();
6706 goto error;
6707 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006708 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006709 goto error;
6710 startout = PyUnicode_AS_UNICODE(*v) + n;
6711 }
6712
6713 /* Decode the byte string character per character */
6714 out = startout;
6715 while (in < endin)
6716 {
6717 /* Decode a character */
6718 insize = 1;
6719 do
6720 {
6721 outsize = MultiByteToWideChar(code_page, flags,
6722 in, insize,
6723 buffer, Py_ARRAY_LENGTH(buffer));
6724 if (outsize > 0)
6725 break;
6726 err = GetLastError();
6727 if (err != ERROR_NO_UNICODE_TRANSLATION
6728 && err != ERROR_INSUFFICIENT_BUFFER)
6729 {
6730 PyErr_SetFromWindowsErr(0);
6731 goto error;
6732 }
6733 insize++;
6734 }
6735 /* 4=maximum length of a UTF-8 sequence */
6736 while (insize <= 4 && (in + insize) <= endin);
6737
6738 if (outsize <= 0) {
6739 Py_ssize_t startinpos, endinpos, outpos;
6740
6741 startinpos = in - startin;
6742 endinpos = startinpos + 1;
6743 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006744 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006745 errors, &errorHandler,
6746 encoding, reason,
6747 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006748 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006749 {
6750 goto error;
6751 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006752 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006753 }
6754 else {
6755 in += insize;
6756 memcpy(out, buffer, outsize * sizeof(wchar_t));
6757 out += outsize;
6758 }
6759 }
6760
6761 /* write a NUL character at the end */
6762 *out = 0;
6763
6764 /* Extend unicode object */
6765 outsize = out - startout;
6766 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006767 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006768 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006769 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006770
6771error:
6772 Py_XDECREF(encoding_obj);
6773 Py_XDECREF(errorHandler);
6774 Py_XDECREF(exc);
6775 return ret;
6776}
6777
Victor Stinner3a50e702011-10-18 21:21:00 +02006778static PyObject *
6779decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006780 const char *s, Py_ssize_t size,
6781 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006782{
Victor Stinner76a31a62011-11-04 00:05:13 +01006783 PyObject *v = NULL;
6784 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006785
Victor Stinner3a50e702011-10-18 21:21:00 +02006786 if (code_page < 0) {
6787 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6788 return NULL;
6789 }
6790
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006791 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006793
Victor Stinner76a31a62011-11-04 00:05:13 +01006794 do
6795 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006796#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006797 if (size > INT_MAX) {
6798 chunk_size = INT_MAX;
6799 final = 0;
6800 done = 0;
6801 }
6802 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006803#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006804 {
6805 chunk_size = (int)size;
6806 final = (consumed == NULL);
6807 done = 1;
6808 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006809
Victor Stinner76a31a62011-11-04 00:05:13 +01006810 /* Skip trailing lead-byte unless 'final' is set */
6811 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6812 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006813
Victor Stinner76a31a62011-11-04 00:05:13 +01006814 if (chunk_size == 0 && done) {
6815 if (v != NULL)
6816 break;
6817 Py_INCREF(unicode_empty);
6818 return unicode_empty;
6819 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006820
Victor Stinner76a31a62011-11-04 00:05:13 +01006821
6822 converted = decode_code_page_strict(code_page, &v,
6823 s, chunk_size);
6824 if (converted == -2)
6825 converted = decode_code_page_errors(code_page, &v,
6826 s, chunk_size,
6827 errors);
6828 assert(converted != 0);
6829
6830 if (converted < 0) {
6831 Py_XDECREF(v);
6832 return NULL;
6833 }
6834
6835 if (consumed)
6836 *consumed += converted;
6837
6838 s += converted;
6839 size -= converted;
6840 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006841
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006842 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843}
6844
Alexander Belopolsky40018472011-02-26 01:02:56 +00006845PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006846PyUnicode_DecodeCodePageStateful(int code_page,
6847 const char *s,
6848 Py_ssize_t size,
6849 const char *errors,
6850 Py_ssize_t *consumed)
6851{
6852 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6853}
6854
6855PyObject *
6856PyUnicode_DecodeMBCSStateful(const char *s,
6857 Py_ssize_t size,
6858 const char *errors,
6859 Py_ssize_t *consumed)
6860{
6861 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6862}
6863
6864PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006865PyUnicode_DecodeMBCS(const char *s,
6866 Py_ssize_t size,
6867 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006868{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006869 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6870}
6871
Victor Stinner3a50e702011-10-18 21:21:00 +02006872static DWORD
6873encode_code_page_flags(UINT code_page, const char *errors)
6874{
6875 if (code_page == CP_UTF8) {
6876 if (winver.dwMajorVersion >= 6)
6877 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6878 and later */
6879 return WC_ERR_INVALID_CHARS;
6880 else
6881 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6882 return 0;
6883 }
6884 else if (code_page == CP_UTF7) {
6885 /* CP_UTF7 only supports flags=0 */
6886 return 0;
6887 }
6888 else {
6889 if (errors != NULL && strcmp(errors, "replace") == 0)
6890 return 0;
6891 else
6892 return WC_NO_BEST_FIT_CHARS;
6893 }
6894}
6895
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006896/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006897 * Encode a Unicode string to a Windows code page into a byte string in strict
6898 * mode.
6899 *
6900 * Returns consumed characters if succeed, returns -2 on encode error, or raise
6901 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006903static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006904encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006905 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006906 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006907{
Victor Stinner554f3f02010-06-16 23:33:54 +00006908 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006909 BOOL *pusedDefaultChar = &usedDefaultChar;
6910 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006911 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006912 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006913 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006914 const DWORD flags = encode_code_page_flags(code_page, NULL);
6915 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006916 /* Create a substring so that we can get the UTF-16 representation
6917 of just the slice under consideration. */
6918 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006919
Martin v. Löwis3d325192011-11-04 18:23:06 +01006920 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006921
Victor Stinner3a50e702011-10-18 21:21:00 +02006922 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006923 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006924 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006925 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006926
Victor Stinner2fc507f2011-11-04 20:06:39 +01006927 substring = PyUnicode_Substring(unicode, offset, offset+len);
6928 if (substring == NULL)
6929 return -1;
6930 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6931 if (p == NULL) {
6932 Py_DECREF(substring);
6933 return -1;
6934 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006935
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006936 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006937 outsize = WideCharToMultiByte(code_page, flags,
6938 p, size,
6939 NULL, 0,
6940 NULL, pusedDefaultChar);
6941 if (outsize <= 0)
6942 goto error;
6943 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006944 if (pusedDefaultChar && *pusedDefaultChar) {
6945 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006946 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006947 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006948
Victor Stinner3a50e702011-10-18 21:21:00 +02006949 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006951 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006952 if (*outbytes == NULL) {
6953 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006955 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006956 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006957 }
6958 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006960 const Py_ssize_t n = PyBytes_Size(*outbytes);
6961 if (outsize > PY_SSIZE_T_MAX - n) {
6962 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006963 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006964 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006965 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006966 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6967 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006968 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006969 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006970 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006971 }
6972
6973 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006974 outsize = WideCharToMultiByte(code_page, flags,
6975 p, size,
6976 out, outsize,
6977 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006978 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006979 if (outsize <= 0)
6980 goto error;
6981 if (pusedDefaultChar && *pusedDefaultChar)
6982 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006983 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006984
Victor Stinner3a50e702011-10-18 21:21:00 +02006985error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006986 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006987 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6988 return -2;
6989 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006990 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006991}
6992
Victor Stinner3a50e702011-10-18 21:21:00 +02006993/*
6994 * Encode a Unicode string to a Windows code page into a byte string using a
6995 * error handler.
6996 *
6997 * Returns consumed characters if succeed, or raise a WindowsError and returns
6998 * -1 on other error.
6999 */
7000static int
7001encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007002 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007003 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007004{
Victor Stinner3a50e702011-10-18 21:21:00 +02007005 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007006 Py_ssize_t pos = unicode_offset;
7007 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007008 /* Ideally, we should get reason from FormatMessage. This is the Windows
7009 2000 English version of the message. */
7010 const char *reason = "invalid character";
7011 /* 4=maximum length of a UTF-8 sequence */
7012 char buffer[4];
7013 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7014 Py_ssize_t outsize;
7015 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007016 PyObject *errorHandler = NULL;
7017 PyObject *exc = NULL;
7018 PyObject *encoding_obj = NULL;
7019 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007020 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007021 PyObject *rep;
7022 int ret = -1;
7023
7024 assert(insize > 0);
7025
7026 encoding = code_page_name(code_page, &encoding_obj);
7027 if (encoding == NULL)
7028 return -1;
7029
7030 if (errors == NULL || strcmp(errors, "strict") == 0) {
7031 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7032 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007033 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007034 if (exc != NULL) {
7035 PyCodec_StrictErrors(exc);
7036 Py_DECREF(exc);
7037 }
7038 Py_XDECREF(encoding_obj);
7039 return -1;
7040 }
7041
7042 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7043 pusedDefaultChar = &usedDefaultChar;
7044 else
7045 pusedDefaultChar = NULL;
7046
7047 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7048 PyErr_NoMemory();
7049 goto error;
7050 }
7051 outsize = insize * Py_ARRAY_LENGTH(buffer);
7052
7053 if (*outbytes == NULL) {
7054 /* Create string object */
7055 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7056 if (*outbytes == NULL)
7057 goto error;
7058 out = PyBytes_AS_STRING(*outbytes);
7059 }
7060 else {
7061 /* Extend string object */
7062 Py_ssize_t n = PyBytes_Size(*outbytes);
7063 if (n > PY_SSIZE_T_MAX - outsize) {
7064 PyErr_NoMemory();
7065 goto error;
7066 }
7067 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7068 goto error;
7069 out = PyBytes_AS_STRING(*outbytes) + n;
7070 }
7071
7072 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007073 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007074 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007075 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7076 wchar_t chars[2];
7077 int charsize;
7078 if (ch < 0x10000) {
7079 chars[0] = (wchar_t)ch;
7080 charsize = 1;
7081 }
7082 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007083 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7084 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007085 charsize = 2;
7086 }
7087
Victor Stinner3a50e702011-10-18 21:21:00 +02007088 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007089 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007090 buffer, Py_ARRAY_LENGTH(buffer),
7091 NULL, pusedDefaultChar);
7092 if (outsize > 0) {
7093 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7094 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007095 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007096 memcpy(out, buffer, outsize);
7097 out += outsize;
7098 continue;
7099 }
7100 }
7101 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7102 PyErr_SetFromWindowsErr(0);
7103 goto error;
7104 }
7105
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 rep = unicode_encode_call_errorhandler(
7107 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007108 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007109 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007110 if (rep == NULL)
7111 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007112 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007113
7114 if (PyBytes_Check(rep)) {
7115 outsize = PyBytes_GET_SIZE(rep);
7116 if (outsize != 1) {
7117 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7118 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7119 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7120 Py_DECREF(rep);
7121 goto error;
7122 }
7123 out = PyBytes_AS_STRING(*outbytes) + offset;
7124 }
7125 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7126 out += outsize;
7127 }
7128 else {
7129 Py_ssize_t i;
7130 enum PyUnicode_Kind kind;
7131 void *data;
7132
Benjamin Petersonbac79492012-01-14 13:34:47 -05007133 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007134 Py_DECREF(rep);
7135 goto error;
7136 }
7137
7138 outsize = PyUnicode_GET_LENGTH(rep);
7139 if (outsize != 1) {
7140 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7141 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7142 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7143 Py_DECREF(rep);
7144 goto error;
7145 }
7146 out = PyBytes_AS_STRING(*outbytes) + offset;
7147 }
7148 kind = PyUnicode_KIND(rep);
7149 data = PyUnicode_DATA(rep);
7150 for (i=0; i < outsize; i++) {
7151 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7152 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007153 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007154 encoding, unicode,
7155 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 "unable to encode error handler result to ASCII");
7157 Py_DECREF(rep);
7158 goto error;
7159 }
7160 *out = (unsigned char)ch;
7161 out++;
7162 }
7163 }
7164 Py_DECREF(rep);
7165 }
7166 /* write a NUL byte */
7167 *out = 0;
7168 outsize = out - PyBytes_AS_STRING(*outbytes);
7169 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7170 if (_PyBytes_Resize(outbytes, outsize) < 0)
7171 goto error;
7172 ret = 0;
7173
7174error:
7175 Py_XDECREF(encoding_obj);
7176 Py_XDECREF(errorHandler);
7177 Py_XDECREF(exc);
7178 return ret;
7179}
7180
Victor Stinner3a50e702011-10-18 21:21:00 +02007181static PyObject *
7182encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007183 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 const char *errors)
7185{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007186 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007188 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007189 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007190
Benjamin Petersonbac79492012-01-14 13:34:47 -05007191 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007192 return NULL;
7193 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007194
Victor Stinner3a50e702011-10-18 21:21:00 +02007195 if (code_page < 0) {
7196 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7197 return NULL;
7198 }
7199
Martin v. Löwis3d325192011-11-04 18:23:06 +01007200 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007201 return PyBytes_FromStringAndSize(NULL, 0);
7202
Victor Stinner7581cef2011-11-03 22:32:33 +01007203 offset = 0;
7204 do
7205 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007206#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007207 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007208 chunks. */
7209 if (len > INT_MAX/2) {
7210 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007211 done = 0;
7212 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007213 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007214#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007215 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007216 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007217 done = 1;
7218 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007219
Victor Stinner76a31a62011-11-04 00:05:13 +01007220 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007221 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007222 errors);
7223 if (ret == -2)
7224 ret = encode_code_page_errors(code_page, &outbytes,
7225 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007226 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007227 if (ret < 0) {
7228 Py_XDECREF(outbytes);
7229 return NULL;
7230 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007231
Victor Stinner7581cef2011-11-03 22:32:33 +01007232 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007233 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007234 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007235
Victor Stinner3a50e702011-10-18 21:21:00 +02007236 return outbytes;
7237}
7238
7239PyObject *
7240PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7241 Py_ssize_t size,
7242 const char *errors)
7243{
Victor Stinner7581cef2011-11-03 22:32:33 +01007244 PyObject *unicode, *res;
7245 unicode = PyUnicode_FromUnicode(p, size);
7246 if (unicode == NULL)
7247 return NULL;
7248 res = encode_code_page(CP_ACP, unicode, errors);
7249 Py_DECREF(unicode);
7250 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007251}
7252
7253PyObject *
7254PyUnicode_EncodeCodePage(int code_page,
7255 PyObject *unicode,
7256 const char *errors)
7257{
Victor Stinner7581cef2011-11-03 22:32:33 +01007258 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007259}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007260
Alexander Belopolsky40018472011-02-26 01:02:56 +00007261PyObject *
7262PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007263{
7264 if (!PyUnicode_Check(unicode)) {
7265 PyErr_BadArgument();
7266 return NULL;
7267 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007268 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007269}
7270
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007271#undef NEED_RETRY
7272
Victor Stinner99b95382011-07-04 14:23:54 +02007273#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007274
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275/* --- Character Mapping Codec -------------------------------------------- */
7276
Alexander Belopolsky40018472011-02-26 01:02:56 +00007277PyObject *
7278PyUnicode_DecodeCharmap(const char *s,
7279 Py_ssize_t size,
7280 PyObject *mapping,
7281 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007283 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007284 Py_ssize_t startinpos;
7285 Py_ssize_t endinpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007286 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007287 _PyUnicodeWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007288 PyObject *errorHandler = NULL;
7289 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007290
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291 /* Default to Latin-1 */
7292 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007295 if (size == 0) {
7296 Py_INCREF(unicode_empty);
7297 return unicode_empty;
7298 }
7299 _PyUnicodeWriter_Init(&writer, 0);
7300 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007301 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007302
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007303 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007304 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007305 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007306 enum PyUnicode_Kind mapkind;
7307 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007308 Py_UCS4 x;
7309
Benjamin Petersonbac79492012-01-14 13:34:47 -05007310 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007311 return NULL;
7312
7313 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007314 mapdata = PyUnicode_DATA(mapping);
7315 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007317 unsigned char ch;
7318 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007319 enum PyUnicode_Kind outkind = writer.kind;
7320 void *outdata = writer.data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007321 if (outkind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007322 Py_UCS4 maxchar = writer.maxchar;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007323 while (s < e) {
7324 unsigned char ch = *s;
7325 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7326 if (x > maxchar)
7327 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007328 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, writer.pos, x);
7329 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007330 ++s;
7331 }
7332 break;
7333 }
7334 else if (outkind == PyUnicode_2BYTE_KIND) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007335 while (s < e) {
7336 unsigned char ch = *s;
7337 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7338 if (x == 0xFFFE)
7339 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007340 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, writer.pos, x);
7341 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007342 ++s;
7343 }
7344 break;
7345 }
7346 }
7347 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007350 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007351 else
7352 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007353Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007354 if (x == 0xfffe)
7355 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 startinpos = s-starts;
7358 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007359 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 errors, &errorHandler,
7361 "charmap", "character maps to <undefined>",
7362 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007363 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007364 goto onError;
7365 }
7366 continue;
7367 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007368
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007369 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007370 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007371 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
7372 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007374 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007375 }
7376 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 while (s < e) {
7378 unsigned char ch = *s;
7379 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007380
Benjamin Peterson29060642009-01-31 22:14:21 +00007381 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7382 w = PyLong_FromLong((long)ch);
7383 if (w == NULL)
7384 goto onError;
7385 x = PyObject_GetItem(mapping, w);
7386 Py_DECREF(w);
7387 if (x == NULL) {
7388 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7389 /* No mapping found means: mapping is undefined. */
7390 PyErr_Clear();
7391 x = Py_None;
7392 Py_INCREF(x);
7393 } else
7394 goto onError;
7395 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007396
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 /* Apply mapping */
7398 if (PyLong_Check(x)) {
7399 long value = PyLong_AS_LONG(x);
Antoine Pitroua1f76552012-09-23 20:00:04 +02007400 if (value < 0 || value > MAX_UNICODE) {
7401 PyErr_Format(PyExc_TypeError,
7402 "character mapping must be in range(0x%lx)",
7403 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007404 Py_DECREF(x);
7405 goto onError;
7406 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007407
7408 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007409 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007410 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7411 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 }
7413 else if (x == Py_None) {
7414 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 startinpos = s-starts;
7416 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007417 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 errors, &errorHandler,
7419 "charmap", "character maps to <undefined>",
7420 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007421 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 Py_DECREF(x);
7423 goto onError;
7424 }
7425 Py_DECREF(x);
7426 continue;
7427 }
7428 else if (PyUnicode_Check(x)) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007429 writer.overallocate = 1;
7430 if (_PyUnicodeWriter_WriteStr(&writer, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007431 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 }
7433 else {
7434 /* wrong return value */
7435 PyErr_SetString(PyExc_TypeError,
7436 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007437 Py_DECREF(x);
7438 goto onError;
7439 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007440 Py_DECREF(x);
7441 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007444 Py_XDECREF(errorHandler);
7445 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007446 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007447
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007449 Py_XDECREF(errorHandler);
7450 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007451 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452 return NULL;
7453}
7454
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007455/* Charmap encoding: the lookup table */
7456
Alexander Belopolsky40018472011-02-26 01:02:56 +00007457struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 PyObject_HEAD
7459 unsigned char level1[32];
7460 int count2, count3;
7461 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007462};
7463
7464static PyObject*
7465encoding_map_size(PyObject *obj, PyObject* args)
7466{
7467 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007468 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007470}
7471
7472static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007473 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 PyDoc_STR("Return the size (in bytes) of this object") },
7475 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007476};
7477
7478static void
7479encoding_map_dealloc(PyObject* o)
7480{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007481 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007482}
7483
7484static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007485 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 "EncodingMap", /*tp_name*/
7487 sizeof(struct encoding_map), /*tp_basicsize*/
7488 0, /*tp_itemsize*/
7489 /* methods */
7490 encoding_map_dealloc, /*tp_dealloc*/
7491 0, /*tp_print*/
7492 0, /*tp_getattr*/
7493 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007494 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 0, /*tp_repr*/
7496 0, /*tp_as_number*/
7497 0, /*tp_as_sequence*/
7498 0, /*tp_as_mapping*/
7499 0, /*tp_hash*/
7500 0, /*tp_call*/
7501 0, /*tp_str*/
7502 0, /*tp_getattro*/
7503 0, /*tp_setattro*/
7504 0, /*tp_as_buffer*/
7505 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7506 0, /*tp_doc*/
7507 0, /*tp_traverse*/
7508 0, /*tp_clear*/
7509 0, /*tp_richcompare*/
7510 0, /*tp_weaklistoffset*/
7511 0, /*tp_iter*/
7512 0, /*tp_iternext*/
7513 encoding_map_methods, /*tp_methods*/
7514 0, /*tp_members*/
7515 0, /*tp_getset*/
7516 0, /*tp_base*/
7517 0, /*tp_dict*/
7518 0, /*tp_descr_get*/
7519 0, /*tp_descr_set*/
7520 0, /*tp_dictoffset*/
7521 0, /*tp_init*/
7522 0, /*tp_alloc*/
7523 0, /*tp_new*/
7524 0, /*tp_free*/
7525 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007526};
7527
7528PyObject*
7529PyUnicode_BuildEncodingMap(PyObject* string)
7530{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007531 PyObject *result;
7532 struct encoding_map *mresult;
7533 int i;
7534 int need_dict = 0;
7535 unsigned char level1[32];
7536 unsigned char level2[512];
7537 unsigned char *mlevel1, *mlevel2, *mlevel3;
7538 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007539 int kind;
7540 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007541 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007542 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007543
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007544 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007545 PyErr_BadArgument();
7546 return NULL;
7547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007548 kind = PyUnicode_KIND(string);
7549 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007550 length = PyUnicode_GET_LENGTH(string);
7551 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007552 memset(level1, 0xFF, sizeof level1);
7553 memset(level2, 0xFF, sizeof level2);
7554
7555 /* If there isn't a one-to-one mapping of NULL to \0,
7556 or if there are non-BMP characters, we need to use
7557 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007558 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007559 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007560 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007561 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007562 ch = PyUnicode_READ(kind, data, i);
7563 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007564 need_dict = 1;
7565 break;
7566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007567 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007568 /* unmapped character */
7569 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007570 l1 = ch >> 11;
7571 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007572 if (level1[l1] == 0xFF)
7573 level1[l1] = count2++;
7574 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007575 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007576 }
7577
7578 if (count2 >= 0xFF || count3 >= 0xFF)
7579 need_dict = 1;
7580
7581 if (need_dict) {
7582 PyObject *result = PyDict_New();
7583 PyObject *key, *value;
7584 if (!result)
7585 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007586 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007587 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007588 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007589 if (!key || !value)
7590 goto failed1;
7591 if (PyDict_SetItem(result, key, value) == -1)
7592 goto failed1;
7593 Py_DECREF(key);
7594 Py_DECREF(value);
7595 }
7596 return result;
7597 failed1:
7598 Py_XDECREF(key);
7599 Py_XDECREF(value);
7600 Py_DECREF(result);
7601 return NULL;
7602 }
7603
7604 /* Create a three-level trie */
7605 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7606 16*count2 + 128*count3 - 1);
7607 if (!result)
7608 return PyErr_NoMemory();
7609 PyObject_Init(result, &EncodingMapType);
7610 mresult = (struct encoding_map*)result;
7611 mresult->count2 = count2;
7612 mresult->count3 = count3;
7613 mlevel1 = mresult->level1;
7614 mlevel2 = mresult->level23;
7615 mlevel3 = mresult->level23 + 16*count2;
7616 memcpy(mlevel1, level1, 32);
7617 memset(mlevel2, 0xFF, 16*count2);
7618 memset(mlevel3, 0, 128*count3);
7619 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007620 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007621 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007622 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7623 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007624 /* unmapped character */
7625 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007626 o1 = ch>>11;
7627 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007628 i2 = 16*mlevel1[o1] + o2;
7629 if (mlevel2[i2] == 0xFF)
7630 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007631 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007632 i3 = 128*mlevel2[i2] + o3;
7633 mlevel3[i3] = i;
7634 }
7635 return result;
7636}
7637
7638static int
Victor Stinner22168992011-11-20 17:09:18 +01007639encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007640{
7641 struct encoding_map *map = (struct encoding_map*)mapping;
7642 int l1 = c>>11;
7643 int l2 = (c>>7) & 0xF;
7644 int l3 = c & 0x7F;
7645 int i;
7646
Victor Stinner22168992011-11-20 17:09:18 +01007647 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007649 if (c == 0)
7650 return 0;
7651 /* level 1*/
7652 i = map->level1[l1];
7653 if (i == 0xFF) {
7654 return -1;
7655 }
7656 /* level 2*/
7657 i = map->level23[16*i+l2];
7658 if (i == 0xFF) {
7659 return -1;
7660 }
7661 /* level 3 */
7662 i = map->level23[16*map->count2 + 128*i + l3];
7663 if (i == 0) {
7664 return -1;
7665 }
7666 return i;
7667}
7668
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007669/* Lookup the character ch in the mapping. If the character
7670 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007671 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007672static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007673charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674{
Christian Heimes217cfd12007-12-02 14:31:20 +00007675 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007676 PyObject *x;
7677
7678 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007680 x = PyObject_GetItem(mapping, w);
7681 Py_DECREF(w);
7682 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007683 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7684 /* No mapping found means: mapping is undefined. */
7685 PyErr_Clear();
7686 x = Py_None;
7687 Py_INCREF(x);
7688 return x;
7689 } else
7690 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007692 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007694 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007695 long value = PyLong_AS_LONG(x);
7696 if (value < 0 || value > 255) {
7697 PyErr_SetString(PyExc_TypeError,
7698 "character mapping must be in range(256)");
7699 Py_DECREF(x);
7700 return NULL;
7701 }
7702 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007704 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 /* wrong return value */
7708 PyErr_Format(PyExc_TypeError,
7709 "character mapping must return integer, bytes or None, not %.400s",
7710 x->ob_type->tp_name);
7711 Py_DECREF(x);
7712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713 }
7714}
7715
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007716static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007717charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007718{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007719 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7720 /* exponentially overallocate to minimize reallocations */
7721 if (requiredsize < 2*outsize)
7722 requiredsize = 2*outsize;
7723 if (_PyBytes_Resize(outobj, requiredsize))
7724 return -1;
7725 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007726}
7727
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007731/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007732 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007733 space is available. Return a new reference to the object that
7734 was put in the output buffer, or Py_None, if the mapping was undefined
7735 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007736 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007737static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007738charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007739 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007740{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007741 PyObject *rep;
7742 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007743 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007744
Christian Heimes90aa7642007-12-19 02:45:37 +00007745 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007746 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007748 if (res == -1)
7749 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 if (outsize<requiredsize)
7751 if (charmapencode_resize(outobj, outpos, requiredsize))
7752 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007753 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 outstart[(*outpos)++] = (char)res;
7755 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007756 }
7757
7758 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007759 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007761 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 Py_DECREF(rep);
7763 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007764 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 if (PyLong_Check(rep)) {
7766 Py_ssize_t requiredsize = *outpos+1;
7767 if (outsize<requiredsize)
7768 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7769 Py_DECREF(rep);
7770 return enc_EXCEPTION;
7771 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007772 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007774 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 else {
7776 const char *repchars = PyBytes_AS_STRING(rep);
7777 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7778 Py_ssize_t requiredsize = *outpos+repsize;
7779 if (outsize<requiredsize)
7780 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7781 Py_DECREF(rep);
7782 return enc_EXCEPTION;
7783 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007784 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 memcpy(outstart + *outpos, repchars, repsize);
7786 *outpos += repsize;
7787 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007788 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007789 Py_DECREF(rep);
7790 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007791}
7792
7793/* handle an error in PyUnicode_EncodeCharmap
7794 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007795static int
7796charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007797 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007798 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007799 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007800 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007801{
7802 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007803 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007804 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007805 enum PyUnicode_Kind kind;
7806 void *data;
7807 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007808 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007809 Py_ssize_t collstartpos = *inpos;
7810 Py_ssize_t collendpos = *inpos+1;
7811 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007812 char *encoding = "charmap";
7813 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007814 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007815 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007816 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007817
Benjamin Petersonbac79492012-01-14 13:34:47 -05007818 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007819 return -1;
7820 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 /* find all unencodable characters */
7822 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007823 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007824 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007825 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007826 val = encoding_map_lookup(ch, mapping);
7827 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 break;
7829 ++collendpos;
7830 continue;
7831 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007832
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007833 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7834 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 if (rep==NULL)
7836 return -1;
7837 else if (rep!=Py_None) {
7838 Py_DECREF(rep);
7839 break;
7840 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007841 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007843 }
7844 /* cache callback name lookup
7845 * (if not done yet, i.e. it's the first error) */
7846 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 if ((errors==NULL) || (!strcmp(errors, "strict")))
7848 *known_errorHandler = 1;
7849 else if (!strcmp(errors, "replace"))
7850 *known_errorHandler = 2;
7851 else if (!strcmp(errors, "ignore"))
7852 *known_errorHandler = 3;
7853 else if (!strcmp(errors, "xmlcharrefreplace"))
7854 *known_errorHandler = 4;
7855 else
7856 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007857 }
7858 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007860 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007861 return -1;
7862 case 2: /* replace */
7863 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 x = charmapencode_output('?', mapping, res, respos);
7865 if (x==enc_EXCEPTION) {
7866 return -1;
7867 }
7868 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007869 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 return -1;
7871 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007872 }
7873 /* fall through */
7874 case 3: /* ignore */
7875 *inpos = collendpos;
7876 break;
7877 case 4: /* xmlcharrefreplace */
7878 /* generate replacement (temporarily (mis)uses p) */
7879 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 char buffer[2+29+1+1];
7881 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007882 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 for (cp = buffer; *cp; ++cp) {
7884 x = charmapencode_output(*cp, mapping, res, respos);
7885 if (x==enc_EXCEPTION)
7886 return -1;
7887 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007888 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 return -1;
7890 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007891 }
7892 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007893 *inpos = collendpos;
7894 break;
7895 default:
7896 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007897 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007899 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007901 if (PyBytes_Check(repunicode)) {
7902 /* Directly copy bytes result to output. */
7903 Py_ssize_t outsize = PyBytes_Size(*res);
7904 Py_ssize_t requiredsize;
7905 repsize = PyBytes_Size(repunicode);
7906 requiredsize = *respos + repsize;
7907 if (requiredsize > outsize)
7908 /* Make room for all additional bytes. */
7909 if (charmapencode_resize(res, respos, requiredsize)) {
7910 Py_DECREF(repunicode);
7911 return -1;
7912 }
7913 memcpy(PyBytes_AsString(*res) + *respos,
7914 PyBytes_AsString(repunicode), repsize);
7915 *respos += repsize;
7916 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007917 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007918 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007919 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007920 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007921 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007922 Py_DECREF(repunicode);
7923 return -1;
7924 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007925 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007926 data = PyUnicode_DATA(repunicode);
7927 kind = PyUnicode_KIND(repunicode);
7928 for (index = 0; index < repsize; index++) {
7929 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7930 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007932 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 return -1;
7934 }
7935 else if (x==enc_FAILED) {
7936 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007937 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 return -1;
7939 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007940 }
7941 *inpos = newpos;
7942 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943 }
7944 return 0;
7945}
7946
Alexander Belopolsky40018472011-02-26 01:02:56 +00007947PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007948_PyUnicode_EncodeCharmap(PyObject *unicode,
7949 PyObject *mapping,
7950 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007952 /* output object */
7953 PyObject *res = NULL;
7954 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007955 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007956 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007958 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007959 PyObject *errorHandler = NULL;
7960 PyObject *exc = NULL;
7961 /* the following variable is used for caching string comparisons
7962 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7963 * 3=ignore, 4=xmlcharrefreplace */
7964 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965
Benjamin Petersonbac79492012-01-14 13:34:47 -05007966 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007967 return NULL;
7968 size = PyUnicode_GET_LENGTH(unicode);
7969
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970 /* Default to Latin-1 */
7971 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007972 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007974 /* allocate enough for a simple encoding without
7975 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007976 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007977 if (res == NULL)
7978 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007979 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007982 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007983 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007985 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 if (x==enc_EXCEPTION) /* error */
7987 goto onError;
7988 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007989 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 &exc,
7991 &known_errorHandler, &errorHandler, errors,
7992 &res, &respos)) {
7993 goto onError;
7994 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007995 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 else
7997 /* done with this character => adjust input position */
7998 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008001 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008002 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008003 if (_PyBytes_Resize(&res, respos) < 0)
8004 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008005
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008006 Py_XDECREF(exc);
8007 Py_XDECREF(errorHandler);
8008 return res;
8009
Benjamin Peterson29060642009-01-31 22:14:21 +00008010 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008011 Py_XDECREF(res);
8012 Py_XDECREF(exc);
8013 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 return NULL;
8015}
8016
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008017/* Deprecated */
8018PyObject *
8019PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8020 Py_ssize_t size,
8021 PyObject *mapping,
8022 const char *errors)
8023{
8024 PyObject *result;
8025 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8026 if (unicode == NULL)
8027 return NULL;
8028 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8029 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008030 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008031}
8032
Alexander Belopolsky40018472011-02-26 01:02:56 +00008033PyObject *
8034PyUnicode_AsCharmapString(PyObject *unicode,
8035 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036{
8037 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 PyErr_BadArgument();
8039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008041 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042}
8043
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008044/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008045static void
8046make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008047 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008048 Py_ssize_t startpos, Py_ssize_t endpos,
8049 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008051 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008052 *exceptionObject = _PyUnicodeTranslateError_Create(
8053 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 }
8055 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8057 goto onError;
8058 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8059 goto onError;
8060 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8061 goto onError;
8062 return;
8063 onError:
8064 Py_DECREF(*exceptionObject);
8065 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066 }
8067}
8068
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008069/* error handling callback helper:
8070 build arguments, call the callback and check the arguments,
8071 put the result into newpos and return the replacement string, which
8072 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008073static PyObject *
8074unicode_translate_call_errorhandler(const char *errors,
8075 PyObject **errorHandler,
8076 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008077 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008078 Py_ssize_t startpos, Py_ssize_t endpos,
8079 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008081 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008083 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084 PyObject *restuple;
8085 PyObject *resunicode;
8086
8087 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008090 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091 }
8092
8093 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008094 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097
8098 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008101 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008102 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008103 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 Py_DECREF(restuple);
8105 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008106 }
8107 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 &resunicode, &i_newpos)) {
8109 Py_DECREF(restuple);
8110 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008111 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008112 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008114 else
8115 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008116 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8118 Py_DECREF(restuple);
8119 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008120 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 Py_INCREF(resunicode);
8122 Py_DECREF(restuple);
8123 return resunicode;
8124}
8125
8126/* Lookup the character ch in the mapping and put the result in result,
8127 which must be decrefed by the caller.
8128 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008129static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008130charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131{
Christian Heimes217cfd12007-12-02 14:31:20 +00008132 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008133 PyObject *x;
8134
8135 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008137 x = PyObject_GetItem(mapping, w);
8138 Py_DECREF(w);
8139 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008140 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8141 /* No mapping found means: use 1:1 mapping. */
8142 PyErr_Clear();
8143 *result = NULL;
8144 return 0;
8145 } else
8146 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008147 }
8148 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008149 *result = x;
8150 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008151 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008152 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 long value = PyLong_AS_LONG(x);
8154 long max = PyUnicode_GetMax();
8155 if (value < 0 || value > max) {
8156 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008157 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 Py_DECREF(x);
8159 return -1;
8160 }
8161 *result = x;
8162 return 0;
8163 }
8164 else if (PyUnicode_Check(x)) {
8165 *result = x;
8166 return 0;
8167 }
8168 else {
8169 /* wrong return value */
8170 PyErr_SetString(PyExc_TypeError,
8171 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008172 Py_DECREF(x);
8173 return -1;
8174 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008175}
8176/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 if not reallocate and adjust various state variables.
8178 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008179static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008182{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008183 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008184 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008185 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 /* exponentially overallocate to minimize reallocations */
8187 if (requiredsize < 2 * oldsize)
8188 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008189 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8190 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008192 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008193 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008194 }
8195 return 0;
8196}
8197/* lookup the character, put the result in the output string and adjust
8198 various state variables. Return a new reference to the object that
8199 was put in the output buffer in *result, or Py_None, if the mapping was
8200 undefined (in which case no character was written).
8201 The called must decref result.
8202 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008203static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008204charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8205 PyObject *mapping, Py_UCS4 **output,
8206 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008207 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008208{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008209 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8210 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008212 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008213 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008214 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008215 }
8216 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008218 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008220 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008221 }
8222 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008223 Py_ssize_t repsize;
8224 if (PyUnicode_READY(*res) == -1)
8225 return -1;
8226 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 if (repsize==1) {
8228 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008229 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 }
8231 else if (repsize!=0) {
8232 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233 Py_ssize_t requiredsize = *opos +
8234 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008236 Py_ssize_t i;
8237 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008239 for(i = 0; i < repsize; i++)
8240 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242 }
8243 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008245 return 0;
8246}
8247
Alexander Belopolsky40018472011-02-26 01:02:56 +00008248PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008249_PyUnicode_TranslateCharmap(PyObject *input,
8250 PyObject *mapping,
8251 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008253 /* input object */
8254 char *idata;
8255 Py_ssize_t size, i;
8256 int kind;
8257 /* output buffer */
8258 Py_UCS4 *output = NULL;
8259 Py_ssize_t osize;
8260 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008261 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008262 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263 char *reason = "character maps to <undefined>";
8264 PyObject *errorHandler = NULL;
8265 PyObject *exc = NULL;
8266 /* the following variable is used for caching string comparisons
8267 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8268 * 3=ignore, 4=xmlcharrefreplace */
8269 int known_errorHandler = -1;
8270
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 PyErr_BadArgument();
8273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008276 if (PyUnicode_READY(input) == -1)
8277 return NULL;
8278 idata = (char*)PyUnicode_DATA(input);
8279 kind = PyUnicode_KIND(input);
8280 size = PyUnicode_GET_LENGTH(input);
8281 i = 0;
8282
8283 if (size == 0) {
8284 Py_INCREF(input);
8285 return input;
8286 }
8287
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008288 /* allocate enough for a simple 1:1 translation without
8289 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008290 osize = size;
8291 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8292 opos = 0;
8293 if (output == NULL) {
8294 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008298 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 /* try to encode it */
8300 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008301 if (charmaptranslate_output(input, i, mapping,
8302 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 Py_XDECREF(x);
8304 goto onError;
8305 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008306 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008308 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 else { /* untranslatable character */
8310 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8311 Py_ssize_t repsize;
8312 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008315 Py_ssize_t collstart = i;
8316 Py_ssize_t collend = i+1;
8317 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008320 while (collend < size) {
8321 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 goto onError;
8323 Py_XDECREF(x);
8324 if (x!=Py_None)
8325 break;
8326 ++collend;
8327 }
8328 /* cache callback name lookup
8329 * (if not done yet, i.e. it's the first error) */
8330 if (known_errorHandler==-1) {
8331 if ((errors==NULL) || (!strcmp(errors, "strict")))
8332 known_errorHandler = 1;
8333 else if (!strcmp(errors, "replace"))
8334 known_errorHandler = 2;
8335 else if (!strcmp(errors, "ignore"))
8336 known_errorHandler = 3;
8337 else if (!strcmp(errors, "xmlcharrefreplace"))
8338 known_errorHandler = 4;
8339 else
8340 known_errorHandler = 0;
8341 }
8342 switch (known_errorHandler) {
8343 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008344 make_translate_exception(&exc,
8345 input, collstart, collend, reason);
8346 if (exc != NULL)
8347 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008348 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 case 2: /* replace */
8350 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 for (coll = collstart; coll<collend; coll++)
8352 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 /* fall through */
8354 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 break;
8357 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008358 /* generate replacement (temporarily (mis)uses i) */
8359 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 char buffer[2+29+1+1];
8361 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8363 if (charmaptranslate_makespace(&output, &osize,
8364 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 goto onError;
8366 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008369 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 break;
8371 default:
8372 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373 reason, input, &exc,
8374 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008375 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008377 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008378 Py_DECREF(repunicode);
8379 goto onError;
8380 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 repsize = PyUnicode_GET_LENGTH(repunicode);
8383 if (charmaptranslate_makespace(&output, &osize,
8384 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 Py_DECREF(repunicode);
8386 goto onError;
8387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 for (uni2 = 0; repsize-->0; ++uni2)
8389 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8390 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008392 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008393 }
8394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008395 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8396 if (!res)
8397 goto onError;
8398 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 Py_XDECREF(exc);
8400 Py_XDECREF(errorHandler);
8401 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405 Py_XDECREF(exc);
8406 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 return NULL;
8408}
8409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008410/* Deprecated. Use PyUnicode_Translate instead. */
8411PyObject *
8412PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8413 Py_ssize_t size,
8414 PyObject *mapping,
8415 const char *errors)
8416{
Christian Heimes5f520f42012-09-11 14:03:25 +02008417 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008418 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8419 if (!unicode)
8420 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008421 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8422 Py_DECREF(unicode);
8423 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424}
8425
Alexander Belopolsky40018472011-02-26 01:02:56 +00008426PyObject *
8427PyUnicode_Translate(PyObject *str,
8428 PyObject *mapping,
8429 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430{
8431 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008432
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433 str = PyUnicode_FromObject(str);
8434 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008435 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008436 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437 Py_DECREF(str);
8438 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439}
Tim Petersced69f82003-09-16 20:30:58 +00008440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008441static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008442fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008443{
8444 /* No need to call PyUnicode_READY(self) because this function is only
8445 called as a callback from fixup() which does it already. */
8446 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8447 const int kind = PyUnicode_KIND(self);
8448 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008449 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008450 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451 Py_ssize_t i;
8452
8453 for (i = 0; i < len; ++i) {
8454 ch = PyUnicode_READ(kind, data, i);
8455 fixed = 0;
8456 if (ch > 127) {
8457 if (Py_UNICODE_ISSPACE(ch))
8458 fixed = ' ';
8459 else {
8460 const int decimal = Py_UNICODE_TODECIMAL(ch);
8461 if (decimal >= 0)
8462 fixed = '0' + decimal;
8463 }
8464 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008465 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008466 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 PyUnicode_WRITE(kind, data, i, fixed);
8468 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008469 else
8470 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 }
8473
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008474 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008475}
8476
8477PyObject *
8478_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8479{
8480 if (!PyUnicode_Check(unicode)) {
8481 PyErr_BadInternalCall();
8482 return NULL;
8483 }
8484 if (PyUnicode_READY(unicode) == -1)
8485 return NULL;
8486 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8487 /* If the string is already ASCII, just return the same string */
8488 Py_INCREF(unicode);
8489 return unicode;
8490 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008491 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492}
8493
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008494PyObject *
8495PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8496 Py_ssize_t length)
8497{
Victor Stinnerf0124502011-11-21 23:12:56 +01008498 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008499 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008500 Py_UCS4 maxchar;
8501 enum PyUnicode_Kind kind;
8502 void *data;
8503
Victor Stinner99d7ad02012-02-22 13:37:39 +01008504 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008505 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008506 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008507 if (ch > 127) {
8508 int decimal = Py_UNICODE_TODECIMAL(ch);
8509 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008510 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008511 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008512 }
8513 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008514
8515 /* Copy to a new string */
8516 decimal = PyUnicode_New(length, maxchar);
8517 if (decimal == NULL)
8518 return decimal;
8519 kind = PyUnicode_KIND(decimal);
8520 data = PyUnicode_DATA(decimal);
8521 /* Iterate over code points */
8522 for (i = 0; i < length; i++) {
8523 Py_UNICODE ch = s[i];
8524 if (ch > 127) {
8525 int decimal = Py_UNICODE_TODECIMAL(ch);
8526 if (decimal >= 0)
8527 ch = '0' + decimal;
8528 }
8529 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008531 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008532}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008533/* --- Decimal Encoder ---------------------------------------------------- */
8534
Alexander Belopolsky40018472011-02-26 01:02:56 +00008535int
8536PyUnicode_EncodeDecimal(Py_UNICODE *s,
8537 Py_ssize_t length,
8538 char *output,
8539 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008540{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008541 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008542 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008543 enum PyUnicode_Kind kind;
8544 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008545
8546 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 PyErr_BadArgument();
8548 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008549 }
8550
Victor Stinner42bf7752011-11-21 22:52:58 +01008551 unicode = PyUnicode_FromUnicode(s, length);
8552 if (unicode == NULL)
8553 return -1;
8554
Benjamin Petersonbac79492012-01-14 13:34:47 -05008555 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008556 Py_DECREF(unicode);
8557 return -1;
8558 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008559 kind = PyUnicode_KIND(unicode);
8560 data = PyUnicode_DATA(unicode);
8561
Victor Stinnerb84d7232011-11-22 01:50:07 +01008562 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008563 PyObject *exc;
8564 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008566 Py_ssize_t startpos;
8567
8568 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008569
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008571 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008572 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008574 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 decimal = Py_UNICODE_TODECIMAL(ch);
8576 if (decimal >= 0) {
8577 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008578 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 continue;
8580 }
8581 if (0 < ch && ch < 256) {
8582 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008583 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 continue;
8585 }
Victor Stinner6345be92011-11-25 20:09:01 +01008586
Victor Stinner42bf7752011-11-21 22:52:58 +01008587 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008588 exc = NULL;
8589 raise_encode_exception(&exc, "decimal", unicode,
8590 startpos, startpos+1,
8591 "invalid decimal Unicode string");
8592 Py_XDECREF(exc);
8593 Py_DECREF(unicode);
8594 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008595 }
8596 /* 0-terminate the output string */
8597 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008598 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008599 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008600}
8601
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602/* --- Helpers ------------------------------------------------------------ */
8603
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008605any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 Py_ssize_t start,
8607 Py_ssize_t end)
8608{
8609 int kind1, kind2, kind;
8610 void *buf1, *buf2;
8611 Py_ssize_t len1, len2, result;
8612
8613 kind1 = PyUnicode_KIND(s1);
8614 kind2 = PyUnicode_KIND(s2);
8615 kind = kind1 > kind2 ? kind1 : kind2;
8616 buf1 = PyUnicode_DATA(s1);
8617 buf2 = PyUnicode_DATA(s2);
8618 if (kind1 != kind)
8619 buf1 = _PyUnicode_AsKind(s1, kind);
8620 if (!buf1)
8621 return -2;
8622 if (kind2 != kind)
8623 buf2 = _PyUnicode_AsKind(s2, kind);
8624 if (!buf2) {
8625 if (kind1 != kind) PyMem_Free(buf1);
8626 return -2;
8627 }
8628 len1 = PyUnicode_GET_LENGTH(s1);
8629 len2 = PyUnicode_GET_LENGTH(s2);
8630
Victor Stinner794d5672011-10-10 03:21:36 +02008631 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008632 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008633 case PyUnicode_1BYTE_KIND:
8634 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8635 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8636 else
8637 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8638 break;
8639 case PyUnicode_2BYTE_KIND:
8640 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8641 break;
8642 case PyUnicode_4BYTE_KIND:
8643 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8644 break;
8645 default:
8646 assert(0); result = -2;
8647 }
8648 }
8649 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008650 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008651 case PyUnicode_1BYTE_KIND:
8652 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8653 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8654 else
8655 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8656 break;
8657 case PyUnicode_2BYTE_KIND:
8658 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8659 break;
8660 case PyUnicode_4BYTE_KIND:
8661 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8662 break;
8663 default:
8664 assert(0); result = -2;
8665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 }
8667
8668 if (kind1 != kind)
8669 PyMem_Free(buf1);
8670 if (kind2 != kind)
8671 PyMem_Free(buf2);
8672
8673 return result;
8674}
8675
8676Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008677_PyUnicode_InsertThousandsGrouping(
8678 PyObject *unicode, Py_ssize_t index,
8679 Py_ssize_t n_buffer,
8680 void *digits, Py_ssize_t n_digits,
8681 Py_ssize_t min_width,
8682 const char *grouping, PyObject *thousands_sep,
8683 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684{
Victor Stinner41a863c2012-02-24 00:37:51 +01008685 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008686 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008687 Py_ssize_t thousands_sep_len;
8688 Py_ssize_t len;
8689
8690 if (unicode != NULL) {
8691 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008692 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008693 }
8694 else {
8695 kind = PyUnicode_1BYTE_KIND;
8696 data = NULL;
8697 }
8698 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8699 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8700 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8701 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008702 if (thousands_sep_kind < kind) {
8703 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8704 if (!thousands_sep_data)
8705 return -1;
8706 }
8707 else {
8708 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8709 if (!data)
8710 return -1;
8711 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008712 }
8713
Benjamin Petersonead6b532011-12-20 17:23:42 -06008714 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008716 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008717 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008718 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008719 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008720 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008721 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008722 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008723 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008724 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008725 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008726 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008728 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008729 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008730 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008731 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008732 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008733 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008734 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008735 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008736 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008737 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008738 break;
8739 default:
8740 assert(0);
8741 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008743 if (unicode != NULL && thousands_sep_kind != kind) {
8744 if (thousands_sep_kind < kind)
8745 PyMem_Free(thousands_sep_data);
8746 else
8747 PyMem_Free(data);
8748 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008749 if (unicode == NULL) {
8750 *maxchar = 127;
8751 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008752 *maxchar = MAX_MAXCHAR(*maxchar,
8753 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008754 }
8755 }
8756 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757}
8758
8759
/* helper macro to fixup start/end slice values

   Clamps `end` to at most `len`; a negative `end` or `start` is taken
   relative to `len` and then clamped to 0.  `start` is NOT clamped to
   `len`, so callers must cope with start > end afterwards.

   `start` and `end` must be modifiable lvalues.  All three arguments are
   evaluated more than once, so they must be free of side effects.  The
   expansion is a single statement: write a trailing semicolon at the use
   site. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008774
Alexander Belopolsky40018472011-02-26 01:02:56 +00008775Py_ssize_t
8776PyUnicode_Count(PyObject *str,
8777 PyObject *substr,
8778 Py_ssize_t start,
8779 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008781 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008782 PyObject* str_obj;
8783 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784 int kind1, kind2, kind;
8785 void *buf1 = NULL, *buf2 = NULL;
8786 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008787
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008788 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008789 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008791 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008792 if (!sub_obj) {
8793 Py_DECREF(str_obj);
8794 return -1;
8795 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008796 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008797 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 Py_DECREF(str_obj);
8799 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 }
Tim Petersced69f82003-09-16 20:30:58 +00008801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802 kind1 = PyUnicode_KIND(str_obj);
8803 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008804 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008807 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008808 if (kind2 > kind) {
8809 Py_DECREF(sub_obj);
8810 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008811 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008812 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008813 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008814 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815 if (!buf2)
8816 goto onError;
8817 len1 = PyUnicode_GET_LENGTH(str_obj);
8818 len2 = PyUnicode_GET_LENGTH(sub_obj);
8819
8820 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008821 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008823 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8824 result = asciilib_count(
8825 ((Py_UCS1*)buf1) + start, end - start,
8826 buf2, len2, PY_SSIZE_T_MAX
8827 );
8828 else
8829 result = ucs1lib_count(
8830 ((Py_UCS1*)buf1) + start, end - start,
8831 buf2, len2, PY_SSIZE_T_MAX
8832 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833 break;
8834 case PyUnicode_2BYTE_KIND:
8835 result = ucs2lib_count(
8836 ((Py_UCS2*)buf1) + start, end - start,
8837 buf2, len2, PY_SSIZE_T_MAX
8838 );
8839 break;
8840 case PyUnicode_4BYTE_KIND:
8841 result = ucs4lib_count(
8842 ((Py_UCS4*)buf1) + start, end - start,
8843 buf2, len2, PY_SSIZE_T_MAX
8844 );
8845 break;
8846 default:
8847 assert(0); result = 0;
8848 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008849
8850 Py_DECREF(sub_obj);
8851 Py_DECREF(str_obj);
8852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853 if (kind2 != kind)
8854 PyMem_Free(buf2);
8855
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 onError:
8858 Py_DECREF(sub_obj);
8859 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008860 if (kind2 != kind && buf2)
8861 PyMem_Free(buf2);
8862 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863}
8864
Alexander Belopolsky40018472011-02-26 01:02:56 +00008865Py_ssize_t
8866PyUnicode_Find(PyObject *str,
8867 PyObject *sub,
8868 Py_ssize_t start,
8869 Py_ssize_t end,
8870 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008872 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008873
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008875 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008877 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008878 if (!sub) {
8879 Py_DECREF(str);
8880 return -2;
8881 }
8882 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8883 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 Py_DECREF(str);
8885 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886 }
Tim Petersced69f82003-09-16 20:30:58 +00008887
Victor Stinner794d5672011-10-10 03:21:36 +02008888 result = any_find_slice(direction,
8889 str, sub, start, end
8890 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008891
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008893 Py_DECREF(sub);
8894
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895 return result;
8896}
8897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898Py_ssize_t
8899PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8900 Py_ssize_t start, Py_ssize_t end,
8901 int direction)
8902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008904 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905 if (PyUnicode_READY(str) == -1)
8906 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008907 if (start < 0 || end < 0) {
8908 PyErr_SetString(PyExc_IndexError, "string index out of range");
8909 return -2;
8910 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 if (end > PyUnicode_GET_LENGTH(str))
8912 end = PyUnicode_GET_LENGTH(str);
8913 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008914 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8915 kind, end-start, ch, direction);
8916 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008918 else
8919 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920}
8921
Alexander Belopolsky40018472011-02-26 01:02:56 +00008922static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008923tailmatch(PyObject *self,
8924 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008925 Py_ssize_t start,
8926 Py_ssize_t end,
8927 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929 int kind_self;
8930 int kind_sub;
8931 void *data_self;
8932 void *data_sub;
8933 Py_ssize_t offset;
8934 Py_ssize_t i;
8935 Py_ssize_t end_sub;
8936
8937 if (PyUnicode_READY(self) == -1 ||
8938 PyUnicode_READY(substring) == -1)
8939 return 0;
8940
8941 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 return 1;
8943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8945 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008947 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 kind_self = PyUnicode_KIND(self);
8950 data_self = PyUnicode_DATA(self);
8951 kind_sub = PyUnicode_KIND(substring);
8952 data_sub = PyUnicode_DATA(substring);
8953 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8954
8955 if (direction > 0)
8956 offset = end;
8957 else
8958 offset = start;
8959
8960 if (PyUnicode_READ(kind_self, data_self, offset) ==
8961 PyUnicode_READ(kind_sub, data_sub, 0) &&
8962 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8963 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8964 /* If both are of the same kind, memcmp is sufficient */
8965 if (kind_self == kind_sub) {
8966 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008967 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968 data_sub,
8969 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008970 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 }
8972 /* otherwise we have to compare each character by first accesing it */
8973 else {
8974 /* We do not need to compare 0 and len(substring)-1 because
8975 the if statement above ensured already that they are equal
8976 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02008977 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 for (i = 1; i < end_sub; ++i) {
8979 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8980 PyUnicode_READ(kind_sub, data_sub, i))
8981 return 0;
8982 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985 }
8986
8987 return 0;
8988}
8989
Alexander Belopolsky40018472011-02-26 01:02:56 +00008990Py_ssize_t
8991PyUnicode_Tailmatch(PyObject *str,
8992 PyObject *substr,
8993 Py_ssize_t start,
8994 Py_ssize_t end,
8995 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008997 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008998
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999 str = PyUnicode_FromObject(str);
9000 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002 substr = PyUnicode_FromObject(substr);
9003 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 Py_DECREF(str);
9005 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006 }
Tim Petersced69f82003-09-16 20:30:58 +00009007
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009008 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010 Py_DECREF(str);
9011 Py_DECREF(substr);
9012 return result;
9013}
9014
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015/* Apply fixfct filter to the Unicode object self and return a
9016 reference to the modified object */
9017
Alexander Belopolsky40018472011-02-26 01:02:56 +00009018static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009019fixup(PyObject *self,
9020 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 PyObject *u;
9023 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009024 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009026 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009028 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009029 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 /* fix functions return the new maximum character in a string,
9032 if the kind of the resulting unicode object does not change,
9033 everything is fine. Otherwise we need to change the string kind
9034 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009035 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009036
9037 if (maxchar_new == 0) {
9038 /* no changes */;
9039 if (PyUnicode_CheckExact(self)) {
9040 Py_DECREF(u);
9041 Py_INCREF(self);
9042 return self;
9043 }
9044 else
9045 return u;
9046 }
9047
Victor Stinnere6abb482012-05-02 01:15:40 +02009048 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049
Victor Stinnereaab6042011-12-11 22:22:39 +01009050 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009052
9053 /* In case the maximum character changed, we need to
9054 convert the string to the new category. */
9055 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9056 if (v == NULL) {
9057 Py_DECREF(u);
9058 return NULL;
9059 }
9060 if (maxchar_new > maxchar_old) {
9061 /* If the maxchar increased so that the kind changed, not all
9062 characters are representable anymore and we need to fix the
9063 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009064 _PyUnicode_FastCopyCharacters(v, 0,
9065 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009066 maxchar_old = fixfct(v);
9067 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 }
9069 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009070 _PyUnicode_FastCopyCharacters(v, 0,
9071 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009073 Py_DECREF(u);
9074 assert(_PyUnicode_CheckConsistency(v, 1));
9075 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076}
9077
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009078static PyObject *
9079ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009081 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9082 char *resdata, *data = PyUnicode_DATA(self);
9083 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009084
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009085 res = PyUnicode_New(len, 127);
9086 if (res == NULL)
9087 return NULL;
9088 resdata = PyUnicode_DATA(res);
9089 if (lower)
9090 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009092 _Py_bytes_upper(resdata, data, len);
9093 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094}
9095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009097handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009099 Py_ssize_t j;
9100 int final_sigma;
9101 Py_UCS4 c;
9102 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009103
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009104 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9105
9106 where ! is a negation and \p{xxx} is a character with property xxx.
9107 */
9108 for (j = i - 1; j >= 0; j--) {
9109 c = PyUnicode_READ(kind, data, j);
9110 if (!_PyUnicode_IsCaseIgnorable(c))
9111 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009113 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9114 if (final_sigma) {
9115 for (j = i + 1; j < length; j++) {
9116 c = PyUnicode_READ(kind, data, j);
9117 if (!_PyUnicode_IsCaseIgnorable(c))
9118 break;
9119 }
9120 final_sigma = j == length || !_PyUnicode_IsCased(c);
9121 }
9122 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123}
9124
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009125static int
9126lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9127 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009129 /* Obscure special case. */
9130 if (c == 0x3A3) {
9131 mapped[0] = handle_capital_sigma(kind, data, length, i);
9132 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009134 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135}
9136
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009137static Py_ssize_t
9138do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009140 Py_ssize_t i, k = 0;
9141 int n_res, j;
9142 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009143
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009144 c = PyUnicode_READ(kind, data, 0);
9145 n_res = _PyUnicode_ToUpperFull(c, mapped);
9146 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009147 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009148 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009150 for (i = 1; i < length; i++) {
9151 c = PyUnicode_READ(kind, data, i);
9152 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9153 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009154 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009155 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009156 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009157 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009158 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009159}
9160
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009161static Py_ssize_t
9162do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9163 Py_ssize_t i, k = 0;
9164
9165 for (i = 0; i < length; i++) {
9166 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9167 int n_res, j;
9168 if (Py_UNICODE_ISUPPER(c)) {
9169 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9170 }
9171 else if (Py_UNICODE_ISLOWER(c)) {
9172 n_res = _PyUnicode_ToUpperFull(c, mapped);
9173 }
9174 else {
9175 n_res = 1;
9176 mapped[0] = c;
9177 }
9178 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009179 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009180 res[k++] = mapped[j];
9181 }
9182 }
9183 return k;
9184}
9185
9186static Py_ssize_t
9187do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9188 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009189{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009190 Py_ssize_t i, k = 0;
9191
9192 for (i = 0; i < length; i++) {
9193 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9194 int n_res, j;
9195 if (lower)
9196 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9197 else
9198 n_res = _PyUnicode_ToUpperFull(c, mapped);
9199 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009200 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009201 res[k++] = mapped[j];
9202 }
9203 }
9204 return k;
9205}
9206
9207static Py_ssize_t
9208do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9209{
9210 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9211}
9212
9213static Py_ssize_t
9214do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9215{
9216 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9217}
9218
Benjamin Petersone51757f2012-01-12 21:10:29 -05009219static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009220do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9221{
9222 Py_ssize_t i, k = 0;
9223
9224 for (i = 0; i < length; i++) {
9225 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9226 Py_UCS4 mapped[3];
9227 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9228 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009229 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009230 res[k++] = mapped[j];
9231 }
9232 }
9233 return k;
9234}
9235
9236static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009237do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9238{
9239 Py_ssize_t i, k = 0;
9240 int previous_is_cased;
9241
9242 previous_is_cased = 0;
9243 for (i = 0; i < length; i++) {
9244 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9245 Py_UCS4 mapped[3];
9246 int n_res, j;
9247
9248 if (previous_is_cased)
9249 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9250 else
9251 n_res = _PyUnicode_ToTitleFull(c, mapped);
9252
9253 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009254 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009255 res[k++] = mapped[j];
9256 }
9257
9258 previous_is_cased = _PyUnicode_IsCased(c);
9259 }
9260 return k;
9261}
9262
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009263static PyObject *
9264case_operation(PyObject *self,
9265 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9266{
9267 PyObject *res = NULL;
9268 Py_ssize_t length, newlength = 0;
9269 int kind, outkind;
9270 void *data, *outdata;
9271 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9272
Benjamin Petersoneea48462012-01-16 14:28:50 -05009273 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009274
9275 kind = PyUnicode_KIND(self);
9276 data = PyUnicode_DATA(self);
9277 length = PyUnicode_GET_LENGTH(self);
9278 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9279 if (tmp == NULL)
9280 return PyErr_NoMemory();
9281 newlength = perform(kind, data, length, tmp, &maxchar);
9282 res = PyUnicode_New(newlength, maxchar);
9283 if (res == NULL)
9284 goto leave;
9285 tmpend = tmp + newlength;
9286 outdata = PyUnicode_DATA(res);
9287 outkind = PyUnicode_KIND(res);
9288 switch (outkind) {
9289 case PyUnicode_1BYTE_KIND:
9290 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9291 break;
9292 case PyUnicode_2BYTE_KIND:
9293 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9294 break;
9295 case PyUnicode_4BYTE_KIND:
9296 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9297 break;
9298 default:
9299 assert(0);
9300 break;
9301 }
9302 leave:
9303 PyMem_FREE(tmp);
9304 return res;
9305}
9306
Tim Peters8ce9f162004-08-27 01:49:32 +00009307PyObject *
9308PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009311 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009313 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009314 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9315 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009316 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009318 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009320 int use_memcpy;
9321 unsigned char *res_data = NULL, *sep_data = NULL;
9322 PyObject *last_obj;
9323 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324
Tim Peters05eba1f2004-08-27 21:32:02 +00009325 fseq = PySequence_Fast(seq, "");
9326 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009327 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009328 }
9329
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009330 /* NOTE: the following code can't call back into Python code,
9331 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009332 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009333
Tim Peters05eba1f2004-08-27 21:32:02 +00009334 seqlen = PySequence_Fast_GET_SIZE(fseq);
9335 /* If empty sequence, return u"". */
9336 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009337 Py_DECREF(fseq);
9338 Py_INCREF(unicode_empty);
9339 res = unicode_empty;
9340 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009341 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009342
Tim Peters05eba1f2004-08-27 21:32:02 +00009343 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009344 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009345 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009346 if (seqlen == 1) {
9347 if (PyUnicode_CheckExact(items[0])) {
9348 res = items[0];
9349 Py_INCREF(res);
9350 Py_DECREF(fseq);
9351 return res;
9352 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009353 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009354 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009355 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009356 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009357 /* Set up sep and seplen */
9358 if (separator == NULL) {
9359 /* fall back to a blank space separator */
9360 sep = PyUnicode_FromOrdinal(' ');
9361 if (!sep)
9362 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009363 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009364 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009365 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009366 else {
9367 if (!PyUnicode_Check(separator)) {
9368 PyErr_Format(PyExc_TypeError,
9369 "separator: expected str instance,"
9370 " %.80s found",
9371 Py_TYPE(separator)->tp_name);
9372 goto onError;
9373 }
9374 if (PyUnicode_READY(separator))
9375 goto onError;
9376 sep = separator;
9377 seplen = PyUnicode_GET_LENGTH(separator);
9378 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9379 /* inc refcount to keep this code path symmetric with the
9380 above case of a blank separator */
9381 Py_INCREF(sep);
9382 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009383 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009384 }
9385
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009386 /* There are at least two things to join, or else we have a subclass
9387 * of str in the sequence.
9388 * Do a pre-pass to figure out the total amount of space we'll
9389 * need (sz), and see whether all argument are strings.
9390 */
9391 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009392#ifdef Py_DEBUG
9393 use_memcpy = 0;
9394#else
9395 use_memcpy = 1;
9396#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009397 for (i = 0; i < seqlen; i++) {
9398 const Py_ssize_t old_sz = sz;
9399 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009400 if (!PyUnicode_Check(item)) {
9401 PyErr_Format(PyExc_TypeError,
9402 "sequence item %zd: expected str instance,"
9403 " %.80s found",
9404 i, Py_TYPE(item)->tp_name);
9405 goto onError;
9406 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009407 if (PyUnicode_READY(item) == -1)
9408 goto onError;
9409 sz += PyUnicode_GET_LENGTH(item);
9410 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009411 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009412 if (i != 0)
9413 sz += seplen;
9414 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9415 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009417 goto onError;
9418 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009419 if (use_memcpy && last_obj != NULL) {
9420 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9421 use_memcpy = 0;
9422 }
9423 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009424 }
Tim Petersced69f82003-09-16 20:30:58 +00009425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009427 if (res == NULL)
9428 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009429
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009430 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009431#ifdef Py_DEBUG
9432 use_memcpy = 0;
9433#else
9434 if (use_memcpy) {
9435 res_data = PyUnicode_1BYTE_DATA(res);
9436 kind = PyUnicode_KIND(res);
9437 if (seplen != 0)
9438 sep_data = PyUnicode_1BYTE_DATA(sep);
9439 }
9440#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009442 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009443 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009444 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009445 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009446 if (use_memcpy) {
9447 Py_MEMCPY(res_data,
9448 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009449 kind * seplen);
9450 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009451 }
9452 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009453 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009454 res_offset += seplen;
9455 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009456 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009457 itemlen = PyUnicode_GET_LENGTH(item);
9458 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009459 if (use_memcpy) {
9460 Py_MEMCPY(res_data,
9461 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009462 kind * itemlen);
9463 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009464 }
9465 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009466 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009467 res_offset += itemlen;
9468 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009469 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009470 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009471 if (use_memcpy)
9472 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009473 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009474 else
9475 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009476
Tim Peters05eba1f2004-08-27 21:32:02 +00009477 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009479 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009483 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009485 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486 return NULL;
9487}
9488
/* Store `value` into `length` consecutive characters of the character
   buffer `data`, starting at character index `start`.  `kind` selects the
   element width: 1-byte buffers are filled with memset(), 2- and 4-byte
   buffers with a loop over Py_UCS2 / Py_UCS4 elements.  PyUnicode_WCHAR_KIND
   is not supported (asserted).

   The arguments are expanded more than once, so pass only expressions
   without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9512
Victor Stinnerd3f08822012-05-29 12:57:52 +02009513void
9514_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9515 Py_UCS4 fill_char)
9516{
9517 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9518 const void *data = PyUnicode_DATA(unicode);
9519 assert(PyUnicode_IS_READY(unicode));
9520 assert(unicode_modifiable(unicode));
9521 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9522 assert(start >= 0);
9523 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9524 FILL(kind, data, fill_char, start, length);
9525}
9526
Victor Stinner3fe55312012-01-04 00:33:50 +01009527Py_ssize_t
9528PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9529 Py_UCS4 fill_char)
9530{
9531 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009532
9533 if (!PyUnicode_Check(unicode)) {
9534 PyErr_BadInternalCall();
9535 return -1;
9536 }
9537 if (PyUnicode_READY(unicode) == -1)
9538 return -1;
9539 if (unicode_check_modifiable(unicode))
9540 return -1;
9541
Victor Stinnerd3f08822012-05-29 12:57:52 +02009542 if (start < 0) {
9543 PyErr_SetString(PyExc_IndexError, "string index out of range");
9544 return -1;
9545 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009546 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9547 PyErr_SetString(PyExc_ValueError,
9548 "fill character is bigger than "
9549 "the string maximum character");
9550 return -1;
9551 }
9552
9553 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9554 length = Py_MIN(maxlen, length);
9555 if (length <= 0)
9556 return 0;
9557
Victor Stinnerd3f08822012-05-29 12:57:52 +02009558 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009559 return length;
9560}
9561
Victor Stinner9310abb2011-10-05 00:59:23 +02009562static PyObject *
9563pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009564 Py_ssize_t left,
9565 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 PyObject *u;
9569 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009570 int kind;
9571 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572
9573 if (left < 0)
9574 left = 0;
9575 if (right < 0)
9576 right = 0;
9577
Victor Stinnerc4b49542011-12-11 22:44:26 +01009578 if (left == 0 && right == 0)
9579 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9582 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009583 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9584 return NULL;
9585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009587 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009589 if (!u)
9590 return NULL;
9591
9592 kind = PyUnicode_KIND(u);
9593 data = PyUnicode_DATA(u);
9594 if (left)
9595 FILL(kind, data, fill, 0, left);
9596 if (right)
9597 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009598 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009599 assert(_PyUnicode_CheckConsistency(u, 1));
9600 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009601}
9602
Alexander Belopolsky40018472011-02-26 01:02:56 +00009603PyObject *
9604PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009607
9608 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009609 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009610 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009611 if (PyUnicode_READY(string) == -1) {
9612 Py_DECREF(string);
9613 return NULL;
9614 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009615
Benjamin Petersonead6b532011-12-20 17:23:42 -06009616 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009618 if (PyUnicode_IS_ASCII(string))
9619 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009620 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009621 PyUnicode_GET_LENGTH(string), keepends);
9622 else
9623 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009624 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009625 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 break;
9627 case PyUnicode_2BYTE_KIND:
9628 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009629 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 PyUnicode_GET_LENGTH(string), keepends);
9631 break;
9632 case PyUnicode_4BYTE_KIND:
9633 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009634 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 PyUnicode_GET_LENGTH(string), keepends);
9636 break;
9637 default:
9638 assert(0);
9639 list = 0;
9640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641 Py_DECREF(string);
9642 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643}
9644
Alexander Belopolsky40018472011-02-26 01:02:56 +00009645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009646split(PyObject *self,
9647 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009648 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 int kind1, kind2, kind;
9651 void *buf1, *buf2;
9652 Py_ssize_t len1, len2;
9653 PyObject* out;
9654
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009656 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009658 if (PyUnicode_READY(self) == -1)
9659 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009662 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009663 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009664 if (PyUnicode_IS_ASCII(self))
9665 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009666 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009667 PyUnicode_GET_LENGTH(self), maxcount
9668 );
9669 else
9670 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009671 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009672 PyUnicode_GET_LENGTH(self), maxcount
9673 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 case PyUnicode_2BYTE_KIND:
9675 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009676 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 PyUnicode_GET_LENGTH(self), maxcount
9678 );
9679 case PyUnicode_4BYTE_KIND:
9680 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009681 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 PyUnicode_GET_LENGTH(self), maxcount
9683 );
9684 default:
9685 assert(0);
9686 return NULL;
9687 }
9688
9689 if (PyUnicode_READY(substring) == -1)
9690 return NULL;
9691
9692 kind1 = PyUnicode_KIND(self);
9693 kind2 = PyUnicode_KIND(substring);
9694 kind = kind1 > kind2 ? kind1 : kind2;
9695 buf1 = PyUnicode_DATA(self);
9696 buf2 = PyUnicode_DATA(substring);
9697 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009698 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 if (!buf1)
9700 return NULL;
9701 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009702 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 if (!buf2) {
9704 if (kind1 != kind) PyMem_Free(buf1);
9705 return NULL;
9706 }
9707 len1 = PyUnicode_GET_LENGTH(self);
9708 len2 = PyUnicode_GET_LENGTH(substring);
9709
Benjamin Petersonead6b532011-12-20 17:23:42 -06009710 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009712 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9713 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009714 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009715 else
9716 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009717 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 break;
9719 case PyUnicode_2BYTE_KIND:
9720 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009721 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 break;
9723 case PyUnicode_4BYTE_KIND:
9724 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009725 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 break;
9727 default:
9728 out = NULL;
9729 }
9730 if (kind1 != kind)
9731 PyMem_Free(buf1);
9732 if (kind2 != kind)
9733 PyMem_Free(buf2);
9734 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735}
9736
Alexander Belopolsky40018472011-02-26 01:02:56 +00009737static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009738rsplit(PyObject *self,
9739 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009740 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009741{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 int kind1, kind2, kind;
9743 void *buf1, *buf2;
9744 Py_ssize_t len1, len2;
9745 PyObject* out;
9746
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009747 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009748 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 if (PyUnicode_READY(self) == -1)
9751 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009753 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009754 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009756 if (PyUnicode_IS_ASCII(self))
9757 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009758 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009759 PyUnicode_GET_LENGTH(self), maxcount
9760 );
9761 else
9762 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009763 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009764 PyUnicode_GET_LENGTH(self), maxcount
9765 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009766 case PyUnicode_2BYTE_KIND:
9767 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009768 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 PyUnicode_GET_LENGTH(self), maxcount
9770 );
9771 case PyUnicode_4BYTE_KIND:
9772 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009773 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 PyUnicode_GET_LENGTH(self), maxcount
9775 );
9776 default:
9777 assert(0);
9778 return NULL;
9779 }
9780
9781 if (PyUnicode_READY(substring) == -1)
9782 return NULL;
9783
9784 kind1 = PyUnicode_KIND(self);
9785 kind2 = PyUnicode_KIND(substring);
9786 kind = kind1 > kind2 ? kind1 : kind2;
9787 buf1 = PyUnicode_DATA(self);
9788 buf2 = PyUnicode_DATA(substring);
9789 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009790 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 if (!buf1)
9792 return NULL;
9793 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009794 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795 if (!buf2) {
9796 if (kind1 != kind) PyMem_Free(buf1);
9797 return NULL;
9798 }
9799 len1 = PyUnicode_GET_LENGTH(self);
9800 len2 = PyUnicode_GET_LENGTH(substring);
9801
Benjamin Petersonead6b532011-12-20 17:23:42 -06009802 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009803 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009804 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9805 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009806 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009807 else
9808 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009809 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 break;
9811 case PyUnicode_2BYTE_KIND:
9812 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009813 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814 break;
9815 case PyUnicode_4BYTE_KIND:
9816 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009817 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 break;
9819 default:
9820 out = NULL;
9821 }
9822 if (kind1 != kind)
9823 PyMem_Free(buf1);
9824 if (kind2 != kind)
9825 PyMem_Free(buf2);
9826 return out;
9827}
9828
9829static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009830anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9831 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009833 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009835 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9836 return asciilib_find(buf1, len1, buf2, len2, offset);
9837 else
9838 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 case PyUnicode_2BYTE_KIND:
9840 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9841 case PyUnicode_4BYTE_KIND:
9842 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9843 }
9844 assert(0);
9845 return -1;
9846}
9847
9848static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009849anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9850 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009852 switch (kind) {
9853 case PyUnicode_1BYTE_KIND:
9854 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9855 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9856 else
9857 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9858 case PyUnicode_2BYTE_KIND:
9859 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9860 case PyUnicode_4BYTE_KIND:
9861 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9862 }
9863 assert(0);
9864 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009865}
9866
Alexander Belopolsky40018472011-02-26 01:02:56 +00009867static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868replace(PyObject *self, PyObject *str1,
9869 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 PyObject *u;
9872 char *sbuf = PyUnicode_DATA(self);
9873 char *buf1 = PyUnicode_DATA(str1);
9874 char *buf2 = PyUnicode_DATA(str2);
9875 int srelease = 0, release1 = 0, release2 = 0;
9876 int skind = PyUnicode_KIND(self);
9877 int kind1 = PyUnicode_KIND(str1);
9878 int kind2 = PyUnicode_KIND(str2);
9879 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9880 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9881 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009882 int mayshrink;
9883 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884
9885 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009886 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009888 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889
Victor Stinner59de0ee2011-10-07 10:01:28 +02009890 if (str1 == str2)
9891 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 if (skind < kind1)
9893 /* substring too wide to be present */
9894 goto nothing;
9895
Victor Stinner49a0a212011-10-12 23:46:10 +02009896 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9897 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9898 /* Replacing str1 with str2 may cause a maxchar reduction in the
9899 result string. */
9900 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +02009901 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009904 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009906 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009908 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009909 Py_UCS4 u1, u2;
9910 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +01009911 Py_ssize_t index, pos;
9912 char *src;
9913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +01009915 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9916 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009917 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009920 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +02009922 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +01009924
9925 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9926 index = 0;
9927 src = sbuf;
9928 while (--maxcount)
9929 {
9930 pos++;
9931 src += pos * PyUnicode_KIND(self);
9932 slen -= pos;
9933 index += pos;
9934 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9935 if (pos < 0)
9936 break;
9937 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9938 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009939 }
9940 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 int rkind = skind;
9942 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009943 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 if (kind1 < rkind) {
9946 /* widen substring */
9947 buf1 = _PyUnicode_AsKind(str1, rkind);
9948 if (!buf1) goto error;
9949 release1 = 1;
9950 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009951 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009952 if (i < 0)
9953 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 if (rkind > kind2) {
9955 /* widen replacement */
9956 buf2 = _PyUnicode_AsKind(str2, rkind);
9957 if (!buf2) goto error;
9958 release2 = 1;
9959 }
9960 else if (rkind < kind2) {
9961 /* widen self and buf1 */
9962 rkind = kind2;
9963 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +01009964 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 sbuf = _PyUnicode_AsKind(self, rkind);
9966 if (!sbuf) goto error;
9967 srelease = 1;
9968 buf1 = _PyUnicode_AsKind(str1, rkind);
9969 if (!buf1) goto error;
9970 release1 = 1;
9971 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009972 u = PyUnicode_New(slen, maxchar);
9973 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009975 assert(PyUnicode_KIND(u) == rkind);
9976 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009977
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009978 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009979 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009980 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009982 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009984
9985 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009986 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009987 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009988 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009989 if (i == -1)
9990 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009991 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009993 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009995 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009997 }
9998 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010000 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 int rkind = skind;
10002 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010005 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 buf1 = _PyUnicode_AsKind(str1, rkind);
10007 if (!buf1) goto error;
10008 release1 = 1;
10009 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010010 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010011 if (n == 0)
10012 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010014 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 buf2 = _PyUnicode_AsKind(str2, rkind);
10016 if (!buf2) goto error;
10017 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010020 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 rkind = kind2;
10022 sbuf = _PyUnicode_AsKind(self, rkind);
10023 if (!sbuf) goto error;
10024 srelease = 1;
10025 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010026 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 buf1 = _PyUnicode_AsKind(str1, rkind);
10028 if (!buf1) goto error;
10029 release1 = 1;
10030 }
10031 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10032 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010033 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 PyErr_SetString(PyExc_OverflowError,
10035 "replace string is too long");
10036 goto error;
10037 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010038 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010039 if (new_size == 0) {
10040 Py_INCREF(unicode_empty);
10041 u = unicode_empty;
10042 goto done;
10043 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010044 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 PyErr_SetString(PyExc_OverflowError,
10046 "replace string is too long");
10047 goto error;
10048 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010049 u = PyUnicode_New(new_size, maxchar);
10050 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010052 assert(PyUnicode_KIND(u) == rkind);
10053 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054 ires = i = 0;
10055 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010056 while (n-- > 0) {
10057 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010058 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010059 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010060 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010061 if (j == -1)
10062 break;
10063 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010064 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010065 memcpy(res + rkind * ires,
10066 sbuf + rkind * i,
10067 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010069 }
10070 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010072 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010074 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010076 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010080 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010081 memcpy(res + rkind * ires,
10082 sbuf + rkind * i,
10083 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010084 }
10085 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010086 /* interleave */
10087 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010088 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010090 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010092 if (--n <= 0)
10093 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010094 memcpy(res + rkind * ires,
10095 sbuf + rkind * i,
10096 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 ires++;
10098 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010099 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010100 memcpy(res + rkind * ires,
10101 sbuf + rkind * i,
10102 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010103 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010104 }
10105
10106 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010107 unicode_adjust_maxchar(&u);
10108 if (u == NULL)
10109 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010111
10112 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 if (srelease)
10114 PyMem_FREE(sbuf);
10115 if (release1)
10116 PyMem_FREE(buf1);
10117 if (release2)
10118 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010119 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010121
Benjamin Peterson29060642009-01-31 22:14:21 +000010122 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010123 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 if (srelease)
10125 PyMem_FREE(sbuf);
10126 if (release1)
10127 PyMem_FREE(buf1);
10128 if (release2)
10129 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010130 return unicode_result_unchanged(self);
10131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 error:
10133 if (srelease && sbuf)
10134 PyMem_FREE(sbuf);
10135 if (release1 && buf1)
10136 PyMem_FREE(buf1);
10137 if (release2 && buf2)
10138 PyMem_FREE(buf2);
10139 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140}
10141
10142/* --- Unicode Object Methods --------------------------------------------- */
10143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010144PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010145 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146\n\
10147Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010148characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149
10150static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010151unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010153 if (PyUnicode_READY(self) == -1)
10154 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010155 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156}
10157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010158PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010159 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160\n\
10161Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010162have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163
10164static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010165unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010167 if (PyUnicode_READY(self) == -1)
10168 return NULL;
10169 if (PyUnicode_GET_LENGTH(self) == 0)
10170 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010171 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172}
10173
Benjamin Petersond5890c82012-01-14 13:23:30 -050010174PyDoc_STRVAR(casefold__doc__,
10175 "S.casefold() -> str\n\
10176\n\
10177Return a version of S suitable for caseless comparisons.");
10178
10179static PyObject *
10180unicode_casefold(PyObject *self)
10181{
10182 if (PyUnicode_READY(self) == -1)
10183 return NULL;
10184 if (PyUnicode_IS_ASCII(self))
10185 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010186 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010187}
10188
10189
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010190/* Argument converter. Coerces to a single unicode character */
10191
10192static int
10193convert_uc(PyObject *obj, void *addr)
10194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010196 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010197
Benjamin Peterson14339b62009-01-31 16:36:08 +000010198 uniobj = PyUnicode_FromObject(obj);
10199 if (uniobj == NULL) {
10200 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010201 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010202 return 0;
10203 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010205 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010206 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010207 Py_DECREF(uniobj);
10208 return 0;
10209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010211 Py_DECREF(uniobj);
10212 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010213}
10214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010215PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010216 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010218Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010219done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220
10221static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010222unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010224 Py_ssize_t marg, left;
10225 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 Py_UCS4 fillchar = ' ';
10227
Victor Stinnere9a29352011-10-01 02:14:59 +020010228 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230
Benjamin Petersonbac79492012-01-14 13:34:47 -050010231 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232 return NULL;
10233
Victor Stinnerc4b49542011-12-11 22:44:26 +010010234 if (PyUnicode_GET_LENGTH(self) >= width)
10235 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236
Victor Stinnerc4b49542011-12-11 22:44:26 +010010237 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238 left = marg / 2 + (marg & width & 1);
10239
Victor Stinner9310abb2011-10-05 00:59:23 +020010240 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241}
10242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243/* This function assumes that str1 and str2 are readied by the caller. */
10244
Marc-André Lemburge5034372000-08-08 08:04:29 +000010245static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010246unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010247{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 int kind1, kind2;
10249 void *data1, *data2;
Victor Stinner770e19e2012-10-04 22:59:45 +020010250 Py_ssize_t len1, len2;
10251 Py_ssize_t i, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010252
Victor Stinner90db9c42012-10-04 21:53:50 +020010253 /* a string is equal to itself */
10254 if (str1 == str2)
10255 return 0;
10256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 kind1 = PyUnicode_KIND(str1);
10258 kind2 = PyUnicode_KIND(str2);
10259 data1 = PyUnicode_DATA(str1);
10260 data2 = PyUnicode_DATA(str2);
10261 len1 = PyUnicode_GET_LENGTH(str1);
10262 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010263 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010264
Victor Stinner770e19e2012-10-04 22:59:45 +020010265 if (kind1 == 1 && kind2 == 1) {
10266 int cmp = memcmp(data1, data2, len);
10267 /* normalize result of memcmp() into the range [-1; 1] */
10268 if (cmp < 0)
10269 return -1;
10270 if (cmp > 0)
10271 return 1;
10272 }
10273 else {
10274 for (i = 0; i < len; ++i) {
10275 Py_UCS4 c1, c2;
10276 c1 = PyUnicode_READ(kind1, data1, i);
10277 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010278
Victor Stinner770e19e2012-10-04 22:59:45 +020010279 if (c1 != c2)
10280 return (c1 < c2) ? -1 : 1;
10281 }
Marc-André Lemburge5034372000-08-08 08:04:29 +000010282 }
10283
Victor Stinner770e19e2012-10-04 22:59:45 +020010284 if (len1 == len2)
10285 return 0;
10286 if (len1 < len2)
10287 return -1;
10288 else
10289 return 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010290}
10291
Victor Stinnere5567ad2012-10-23 02:48:49 +020010292static int
10293unicode_compare_eq(PyObject *str1, PyObject *str2)
10294{
10295 int kind;
10296 void *data1, *data2;
10297 Py_ssize_t len;
10298 int cmp;
10299
10300 /* a string is equal to itself */
10301 if (str1 == str2)
10302 return 1;
10303
10304 len = PyUnicode_GET_LENGTH(str1);
10305 if (PyUnicode_GET_LENGTH(str2) != len)
10306 return 0;
10307 kind = PyUnicode_KIND(str1);
10308 if (PyUnicode_KIND(str2) != kind)
10309 return 0;
10310 data1 = PyUnicode_DATA(str1);
10311 data2 = PyUnicode_DATA(str2);
10312
10313 cmp = memcmp(data1, data2, len * kind);
10314 return (cmp == 0);
10315}
10316
10317
Alexander Belopolsky40018472011-02-26 01:02:56 +000010318int
10319PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10322 if (PyUnicode_READY(left) == -1 ||
10323 PyUnicode_READY(right) == -1)
10324 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010325 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010327 PyErr_Format(PyExc_TypeError,
10328 "Can't compare %.100s and %.100s",
10329 left->ob_type->tp_name,
10330 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331 return -1;
10332}
10333
Martin v. Löwis5b222132007-06-10 09:51:05 +000010334int
10335PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 Py_ssize_t i;
10338 int kind;
10339 void *data;
10340 Py_UCS4 chr;
10341
Victor Stinner910337b2011-10-03 03:20:16 +020010342 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 if (PyUnicode_READY(uni) == -1)
10344 return -1;
10345 kind = PyUnicode_KIND(uni);
10346 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010347 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10349 if (chr != str[i])
10350 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010351 /* This check keeps Python strings that end in '\0' from comparing equal
10352 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010354 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010355 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010356 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010357 return 0;
10358}
10359
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010360
Benjamin Peterson29060642009-01-31 22:14:21 +000010361#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010362 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010363
Alexander Belopolsky40018472011-02-26 01:02:56 +000010364PyObject *
10365PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010366{
10367 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010368 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010369
Victor Stinnere5567ad2012-10-23 02:48:49 +020010370 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10371 Py_RETURN_NOTIMPLEMENTED;
10372
10373 if (PyUnicode_READY(left) == -1 ||
10374 PyUnicode_READY(right) == -1)
10375 return NULL;
10376
10377 if (op == Py_EQ || op == Py_NE) {
10378 result = unicode_compare_eq(left, right);
10379 if (op == Py_EQ)
10380 v = TEST_COND(result);
10381 else
10382 v = TEST_COND(!result);
10383 }
10384 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010385 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010386
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010387 /* Convert the return value to a Boolean */
10388 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010389 case Py_LE:
10390 v = TEST_COND(result <= 0);
10391 break;
10392 case Py_GE:
10393 v = TEST_COND(result >= 0);
10394 break;
10395 case Py_LT:
10396 v = TEST_COND(result == -1);
10397 break;
10398 case Py_GT:
10399 v = TEST_COND(result == 1);
10400 break;
10401 default:
10402 PyErr_BadArgument();
10403 return NULL;
10404 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010405 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010406 Py_INCREF(v);
10407 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010408}
10409
Alexander Belopolsky40018472011-02-26 01:02:56 +000010410int
10411PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010412{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010413 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 int kind1, kind2, kind;
10415 void *buf1, *buf2;
10416 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010417 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010418
10419 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010420 sub = PyUnicode_FromObject(element);
10421 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010422 PyErr_Format(PyExc_TypeError,
10423 "'in <string>' requires string as left operand, not %s",
10424 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010425 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010426 }
10427
Thomas Wouters477c8d52006-05-27 19:21:47 +000010428 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010429 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010430 Py_DECREF(sub);
10431 return -1;
10432 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010433 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10434 Py_DECREF(sub);
10435 Py_DECREF(str);
10436 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 kind1 = PyUnicode_KIND(str);
10439 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010440 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 buf1 = PyUnicode_DATA(str);
10442 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010443 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010444 if (kind2 > kind) {
10445 Py_DECREF(sub);
10446 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010447 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010448 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010449 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 if (!buf2) {
10452 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010453 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 return -1;
10455 }
10456 len1 = PyUnicode_GET_LENGTH(str);
10457 len2 = PyUnicode_GET_LENGTH(sub);
10458
Benjamin Petersonead6b532011-12-20 17:23:42 -060010459 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 case PyUnicode_1BYTE_KIND:
10461 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10462 break;
10463 case PyUnicode_2BYTE_KIND:
10464 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10465 break;
10466 case PyUnicode_4BYTE_KIND:
10467 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10468 break;
10469 default:
10470 result = -1;
10471 assert(0);
10472 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010473
10474 Py_DECREF(str);
10475 Py_DECREF(sub);
10476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 if (kind2 != kind)
10478 PyMem_Free(buf2);
10479
Guido van Rossum403d68b2000-03-13 15:55:09 +000010480 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010481}
10482
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483/* Concat to string or Unicode object giving a new Unicode object. */
10484
Alexander Belopolsky40018472011-02-26 01:02:56 +000010485PyObject *
10486PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010489 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010490 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491
10492 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010495 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010498 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499
10500 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010501 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010502 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010504 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010505 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010506 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508 }
10509
Victor Stinner488fa492011-12-12 00:01:39 +010010510 u_len = PyUnicode_GET_LENGTH(u);
10511 v_len = PyUnicode_GET_LENGTH(v);
10512 if (u_len > PY_SSIZE_T_MAX - v_len) {
10513 PyErr_SetString(PyExc_OverflowError,
10514 "strings are too large to concat");
10515 goto onError;
10516 }
10517 new_len = u_len + v_len;
10518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010520 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010521 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010524 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010526 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010527 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10528 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529 Py_DECREF(u);
10530 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010531 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533
Benjamin Peterson29060642009-01-31 22:14:21 +000010534 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535 Py_XDECREF(u);
10536 Py_XDECREF(v);
10537 return NULL;
10538}
10539
Walter Dörwald1ab83302007-05-18 17:15:44 +000010540void
Victor Stinner23e56682011-10-03 03:54:37 +020010541PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010542{
Victor Stinner23e56682011-10-03 03:54:37 +020010543 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010544 Py_UCS4 maxchar, maxchar2;
10545 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010546
10547 if (p_left == NULL) {
10548 if (!PyErr_Occurred())
10549 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010550 return;
10551 }
Victor Stinner23e56682011-10-03 03:54:37 +020010552 left = *p_left;
10553 if (right == NULL || !PyUnicode_Check(left)) {
10554 if (!PyErr_Occurred())
10555 PyErr_BadInternalCall();
10556 goto error;
10557 }
10558
Benjamin Petersonbac79492012-01-14 13:34:47 -050010559 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010560 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010561 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010562 goto error;
10563
Victor Stinner488fa492011-12-12 00:01:39 +010010564 /* Shortcuts */
10565 if (left == unicode_empty) {
10566 Py_DECREF(left);
10567 Py_INCREF(right);
10568 *p_left = right;
10569 return;
10570 }
10571 if (right == unicode_empty)
10572 return;
10573
10574 left_len = PyUnicode_GET_LENGTH(left);
10575 right_len = PyUnicode_GET_LENGTH(right);
10576 if (left_len > PY_SSIZE_T_MAX - right_len) {
10577 PyErr_SetString(PyExc_OverflowError,
10578 "strings are too large to concat");
10579 goto error;
10580 }
10581 new_len = left_len + right_len;
10582
10583 if (unicode_modifiable(left)
10584 && PyUnicode_CheckExact(right)
10585 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010586 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10587 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010588 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010589 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010590 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10591 {
10592 /* append inplace */
10593 if (unicode_resize(p_left, new_len) != 0) {
10594 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10595 * deallocated so it cannot be put back into
10596 * 'variable'. The MemoryError is raised when there
10597 * is no value in 'variable', which might (very
10598 * remotely) be a cause of incompatibilities.
10599 */
10600 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010601 }
Victor Stinner488fa492011-12-12 00:01:39 +010010602 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010603 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010604 }
Victor Stinner488fa492011-12-12 00:01:39 +010010605 else {
10606 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10607 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010608 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010609
Victor Stinner488fa492011-12-12 00:01:39 +010010610 /* Concat the two Unicode strings */
10611 res = PyUnicode_New(new_len, maxchar);
10612 if (res == NULL)
10613 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010614 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10615 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010616 Py_DECREF(left);
10617 *p_left = res;
10618 }
10619 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010620 return;
10621
10622error:
Victor Stinner488fa492011-12-12 00:01:39 +010010623 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010624}
10625
10626void
10627PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10628{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010629 PyUnicode_Append(pleft, right);
10630 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010631}
10632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010633PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010634 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010636Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010637string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010638interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639
10640static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010641unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010643 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010644 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010645 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 int kind1, kind2, kind;
10648 void *buf1, *buf2;
10649 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010650
Jesus Ceaac451502011-04-20 17:09:23 +020010651 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10652 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010653 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 kind1 = PyUnicode_KIND(self);
10656 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010657 if (kind2 > kind1)
10658 return PyLong_FromLong(0);
10659 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 buf1 = PyUnicode_DATA(self);
10661 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010663 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 if (!buf2) {
10665 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 return NULL;
10667 }
10668 len1 = PyUnicode_GET_LENGTH(self);
10669 len2 = PyUnicode_GET_LENGTH(substring);
10670
10671 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010672 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 case PyUnicode_1BYTE_KIND:
10674 iresult = ucs1lib_count(
10675 ((Py_UCS1*)buf1) + start, end - start,
10676 buf2, len2, PY_SSIZE_T_MAX
10677 );
10678 break;
10679 case PyUnicode_2BYTE_KIND:
10680 iresult = ucs2lib_count(
10681 ((Py_UCS2*)buf1) + start, end - start,
10682 buf2, len2, PY_SSIZE_T_MAX
10683 );
10684 break;
10685 case PyUnicode_4BYTE_KIND:
10686 iresult = ucs4lib_count(
10687 ((Py_UCS4*)buf1) + start, end - start,
10688 buf2, len2, PY_SSIZE_T_MAX
10689 );
10690 break;
10691 default:
10692 assert(0); iresult = 0;
10693 }
10694
10695 result = PyLong_FromSsize_t(iresult);
10696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 if (kind2 != kind)
10698 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699
10700 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010701
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702 return result;
10703}
10704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010705PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010706 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010708Encode S using the codec registered for encoding. Default encoding\n\
10709is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010710handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010711a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10712'xmlcharrefreplace' as well as any other name registered with\n\
10713codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714
10715static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010716unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010718 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719 char *encoding = NULL;
10720 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010721
Benjamin Peterson308d6372009-09-18 21:42:35 +000010722 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10723 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010725 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010726}
10727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010728PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010729 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730\n\
10731Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010732If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733
10734static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010735unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010737 Py_ssize_t i, j, line_pos, src_len, incr;
10738 Py_UCS4 ch;
10739 PyObject *u;
10740 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010742 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010743 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744
10745 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747
Antoine Pitrou22425222011-10-04 19:10:51 +020010748 if (PyUnicode_READY(self) == -1)
10749 return NULL;
10750
Thomas Wouters7e474022000-07-16 12:04:32 +000010751 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010752 src_len = PyUnicode_GET_LENGTH(self);
10753 i = j = line_pos = 0;
10754 kind = PyUnicode_KIND(self);
10755 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010756 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010757 for (; i < src_len; i++) {
10758 ch = PyUnicode_READ(kind, src_data, i);
10759 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010760 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010761 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010762 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010763 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010764 goto overflow;
10765 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010766 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010767 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010770 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010771 goto overflow;
10772 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010774 if (ch == '\n' || ch == '\r')
10775 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010777 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010778 if (!found)
10779 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010780
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010782 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783 if (!u)
10784 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010785 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786
Antoine Pitroue71d5742011-10-04 15:55:09 +020010787 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788
Antoine Pitroue71d5742011-10-04 15:55:09 +020010789 for (; i < src_len; i++) {
10790 ch = PyUnicode_READ(kind, src_data, i);
10791 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010792 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010793 incr = tabsize - (line_pos % tabsize);
10794 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010795 FILL(kind, dest_data, ' ', j, incr);
10796 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010797 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010798 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010799 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010800 line_pos++;
10801 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010802 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010803 if (ch == '\n' || ch == '\r')
10804 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010806 }
10807 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010808 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010809
Antoine Pitroue71d5742011-10-04 15:55:09 +020010810 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010811 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10812 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813}
10814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010815PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010816 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817\n\
10818Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010819such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820arguments start and end are interpreted as in slice notation.\n\
10821\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010822Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823
10824static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010827 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010828 Py_ssize_t start;
10829 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010830 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831
Jesus Ceaac451502011-04-20 17:09:23 +020010832 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10833 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 if (PyUnicode_READY(self) == -1)
10837 return NULL;
10838 if (PyUnicode_READY(substring) == -1)
10839 return NULL;
10840
Victor Stinner7931d9a2011-11-04 00:22:48 +010010841 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842
10843 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845 if (result == -2)
10846 return NULL;
10847
Christian Heimes217cfd12007-12-02 14:31:20 +000010848 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849}
10850
10851static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010852unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010854 void *data;
10855 enum PyUnicode_Kind kind;
10856 Py_UCS4 ch;
10857 PyObject *res;
10858
10859 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10860 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010862 }
10863 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10864 PyErr_SetString(PyExc_IndexError, "string index out of range");
10865 return NULL;
10866 }
10867 kind = PyUnicode_KIND(self);
10868 data = PyUnicode_DATA(self);
10869 ch = PyUnicode_READ(kind, data, index);
10870 if (ch < 256)
10871 return get_latin1_char(ch);
10872
10873 res = PyUnicode_New(1, ch);
10874 if (res == NULL)
10875 return NULL;
10876 kind = PyUnicode_KIND(res);
10877 data = PyUnicode_DATA(res);
10878 PyUnicode_WRITE(kind, data, 0, ch);
10879 assert(_PyUnicode_CheckConsistency(res, 1));
10880 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010881}
10882
Guido van Rossumc2504932007-09-18 19:42:40 +000010883/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010884 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010885static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010886unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887{
Guido van Rossumc2504932007-09-18 19:42:40 +000010888 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010889 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010890
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010891#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050010892 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010893#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 if (_PyUnicode_HASH(self) != -1)
10895 return _PyUnicode_HASH(self);
10896 if (PyUnicode_READY(self) == -1)
10897 return -1;
10898 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010010899 /*
10900 We make the hash of the empty string be 0, rather than using
10901 (prefix ^ suffix), since this slightly obfuscates the hash secret
10902 */
10903 if (len == 0) {
10904 _PyUnicode_HASH(self) = 0;
10905 return 0;
10906 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907
10908 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010010909#define HASH(P) \
10910 x ^= (Py_uhash_t) *P << 7; \
10911 while (--len >= 0) \
10912 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010913
Georg Brandl2fb477c2012-02-21 00:33:36 +010010914 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010915 switch (PyUnicode_KIND(self)) {
10916 case PyUnicode_1BYTE_KIND: {
10917 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10918 HASH(c);
10919 break;
10920 }
10921 case PyUnicode_2BYTE_KIND: {
10922 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10923 HASH(s);
10924 break;
10925 }
10926 default: {
10927 Py_UCS4 *l;
10928 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10929 "Impossible switch case in unicode_hash");
10930 l = PyUnicode_4BYTE_DATA(self);
10931 HASH(l);
10932 break;
10933 }
10934 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010010935 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
10936 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937
Guido van Rossumc2504932007-09-18 19:42:40 +000010938 if (x == -1)
10939 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010940 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010941 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010945PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010946 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010948Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949
10950static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010953 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010954 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010955 Py_ssize_t start;
10956 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957
Jesus Ceaac451502011-04-20 17:09:23 +020010958 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10959 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 if (PyUnicode_READY(self) == -1)
10963 return NULL;
10964 if (PyUnicode_READY(substring) == -1)
10965 return NULL;
10966
Victor Stinner7931d9a2011-11-04 00:22:48 +010010967 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968
10969 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 if (result == -2)
10972 return NULL;
10973
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974 if (result < 0) {
10975 PyErr_SetString(PyExc_ValueError, "substring not found");
10976 return NULL;
10977 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010978
Christian Heimes217cfd12007-12-02 14:31:20 +000010979 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980}
10981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010982PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010983 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010985Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010986at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987
10988static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010989unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 Py_ssize_t i, length;
10992 int kind;
10993 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994 int cased;
10995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 if (PyUnicode_READY(self) == -1)
10997 return NULL;
10998 length = PyUnicode_GET_LENGTH(self);
10999 kind = PyUnicode_KIND(self);
11000 data = PyUnicode_DATA(self);
11001
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 if (length == 1)
11004 return PyBool_FromLong(
11005 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011007 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011009 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011010
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 for (i = 0; i < length; i++) {
11013 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011014
Benjamin Peterson29060642009-01-31 22:14:21 +000011015 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11016 return PyBool_FromLong(0);
11017 else if (!cased && Py_UNICODE_ISLOWER(ch))
11018 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011020 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021}
11022
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011023PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011024 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011026Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011027at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028
11029static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011030unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011032 Py_ssize_t i, length;
11033 int kind;
11034 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035 int cased;
11036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 if (PyUnicode_READY(self) == -1)
11038 return NULL;
11039 length = PyUnicode_GET_LENGTH(self);
11040 kind = PyUnicode_KIND(self);
11041 data = PyUnicode_DATA(self);
11042
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 if (length == 1)
11045 return PyBool_FromLong(
11046 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011048 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011050 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011051
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 for (i = 0; i < length; i++) {
11054 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011055
Benjamin Peterson29060642009-01-31 22:14:21 +000011056 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11057 return PyBool_FromLong(0);
11058 else if (!cased && Py_UNICODE_ISUPPER(ch))
11059 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011061 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062}
11063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011064PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011065 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011067Return True if S is a titlecased string and there is at least one\n\
11068character in S, i.e. upper- and titlecase characters may only\n\
11069follow uncased characters and lowercase characters only cased ones.\n\
11070Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071
11072static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011073unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011075 Py_ssize_t i, length;
11076 int kind;
11077 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078 int cased, previous_is_cased;
11079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 if (PyUnicode_READY(self) == -1)
11081 return NULL;
11082 length = PyUnicode_GET_LENGTH(self);
11083 kind = PyUnicode_KIND(self);
11084 data = PyUnicode_DATA(self);
11085
Guido van Rossumd57fd912000-03-10 22:53:23 +000011086 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 if (length == 1) {
11088 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11089 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11090 (Py_UNICODE_ISUPPER(ch) != 0));
11091 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011093 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011095 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011096
Guido van Rossumd57fd912000-03-10 22:53:23 +000011097 cased = 0;
11098 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099 for (i = 0; i < length; i++) {
11100 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011101
Benjamin Peterson29060642009-01-31 22:14:21 +000011102 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11103 if (previous_is_cased)
11104 return PyBool_FromLong(0);
11105 previous_is_cased = 1;
11106 cased = 1;
11107 }
11108 else if (Py_UNICODE_ISLOWER(ch)) {
11109 if (!previous_is_cased)
11110 return PyBool_FromLong(0);
11111 previous_is_cased = 1;
11112 cased = 1;
11113 }
11114 else
11115 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011117 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118}
11119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011120PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011121 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011123Return True if all characters in S are whitespace\n\
11124and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125
11126static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011127unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011129 Py_ssize_t i, length;
11130 int kind;
11131 void *data;
11132
11133 if (PyUnicode_READY(self) == -1)
11134 return NULL;
11135 length = PyUnicode_GET_LENGTH(self);
11136 kind = PyUnicode_KIND(self);
11137 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140 if (length == 1)
11141 return PyBool_FromLong(
11142 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011144 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011145 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 for (i = 0; i < length; i++) {
11149 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011150 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011151 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011153 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154}
11155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011156PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011157 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011158\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011159Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011160and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011161
11162static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011163unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 Py_ssize_t i, length;
11166 int kind;
11167 void *data;
11168
11169 if (PyUnicode_READY(self) == -1)
11170 return NULL;
11171 length = PyUnicode_GET_LENGTH(self);
11172 kind = PyUnicode_KIND(self);
11173 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011174
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011175 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011176 if (length == 1)
11177 return PyBool_FromLong(
11178 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011179
11180 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011182 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 for (i = 0; i < length; i++) {
11185 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011186 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011187 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011188 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011189}
11190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011191PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011192 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011193\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011194Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011195and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011196
11197static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011198unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011199{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 int kind;
11201 void *data;
11202 Py_ssize_t len, i;
11203
11204 if (PyUnicode_READY(self) == -1)
11205 return NULL;
11206
11207 kind = PyUnicode_KIND(self);
11208 data = PyUnicode_DATA(self);
11209 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011210
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011211 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 if (len == 1) {
11213 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11214 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11215 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011216
11217 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011219 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221 for (i = 0; i < len; i++) {
11222 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011223 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011224 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011225 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011226 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011227}
11228
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011229PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011230 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011232Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011233False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234
11235static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011236unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 Py_ssize_t i, length;
11239 int kind;
11240 void *data;
11241
11242 if (PyUnicode_READY(self) == -1)
11243 return NULL;
11244 length = PyUnicode_GET_LENGTH(self);
11245 kind = PyUnicode_KIND(self);
11246 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 if (length == 1)
11250 return PyBool_FromLong(
11251 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011253 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011255 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011257 for (i = 0; i < length; i++) {
11258 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011259 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011261 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262}
11263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011264PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011267Return True if all characters in S are digits\n\
11268and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269
11270static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011271unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273 Py_ssize_t i, length;
11274 int kind;
11275 void *data;
11276
11277 if (PyUnicode_READY(self) == -1)
11278 return NULL;
11279 length = PyUnicode_GET_LENGTH(self);
11280 kind = PyUnicode_KIND(self);
11281 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284 if (length == 1) {
11285 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11286 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11287 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011289 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011291 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011293 for (i = 0; i < length; i++) {
11294 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011295 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011297 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298}
11299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011300PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011301 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011303Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011304False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305
11306static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011307unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 Py_ssize_t i, length;
11310 int kind;
11311 void *data;
11312
11313 if (PyUnicode_READY(self) == -1)
11314 return NULL;
11315 length = PyUnicode_GET_LENGTH(self);
11316 kind = PyUnicode_KIND(self);
11317 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 if (length == 1)
11321 return PyBool_FromLong(
11322 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011324 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 for (i = 0; i < length; i++) {
11329 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011330 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011332 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333}
11334
Martin v. Löwis47383402007-08-15 07:32:56 +000011335int
11336PyUnicode_IsIdentifier(PyObject *self)
11337{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 int kind;
11339 void *data;
11340 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011341 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 if (PyUnicode_READY(self) == -1) {
11344 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011345 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346 }
11347
11348 /* Special case for empty strings */
11349 if (PyUnicode_GET_LENGTH(self) == 0)
11350 return 0;
11351 kind = PyUnicode_KIND(self);
11352 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011353
11354 /* PEP 3131 says that the first character must be in
11355 XID_Start and subsequent characters in XID_Continue,
11356 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011357 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011358 letters, digits, underscore). However, given the current
11359 definition of XID_Start and XID_Continue, it is sufficient
11360 to check just for these, except that _ must be allowed
11361 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011363 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011364 return 0;
11365
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011366 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011368 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011369 return 1;
11370}
11371
11372PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011373 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011374\n\
11375Return True if S is a valid identifier according\n\
11376to the language definition.");
11377
11378static PyObject*
11379unicode_isidentifier(PyObject *self)
11380{
11381 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11382}
11383
Georg Brandl559e5d72008-06-11 18:37:52 +000011384PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011386\n\
11387Return True if all characters in S are considered\n\
11388printable in repr() or S is empty, False otherwise.");
11389
11390static PyObject*
11391unicode_isprintable(PyObject *self)
11392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 Py_ssize_t i, length;
11394 int kind;
11395 void *data;
11396
11397 if (PyUnicode_READY(self) == -1)
11398 return NULL;
11399 length = PyUnicode_GET_LENGTH(self);
11400 kind = PyUnicode_KIND(self);
11401 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011402
11403 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 if (length == 1)
11405 return PyBool_FromLong(
11406 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 for (i = 0; i < length; i++) {
11409 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011410 Py_RETURN_FALSE;
11411 }
11412 }
11413 Py_RETURN_TRUE;
11414}
11415
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011416PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011417 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418\n\
11419Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011420iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
11422static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011423unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011425 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426}
11427
Martin v. Löwis18e16552006-02-15 17:27:45 +000011428static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011429unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 if (PyUnicode_READY(self) == -1)
11432 return -1;
11433 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434}
11435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011436PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011437 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011439Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011440done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
11442static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011443unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011445 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 Py_UCS4 fillchar = ' ';
11447
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011448 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449 return NULL;
11450
Benjamin Petersonbac79492012-01-14 13:34:47 -050011451 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011452 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453
Victor Stinnerc4b49542011-12-11 22:44:26 +010011454 if (PyUnicode_GET_LENGTH(self) >= width)
11455 return unicode_result_unchanged(self);
11456
11457 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458}
11459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011460PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011461 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011463Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464
11465static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011466unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011468 if (PyUnicode_READY(self) == -1)
11469 return NULL;
11470 if (PyUnicode_IS_ASCII(self))
11471 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011472 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473}
11474
/* Selectors for the three strip flavours; they double as indices into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above.
   The table is a fixed lookup: both the characters and the slots
   themselves are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11483
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011484/* externally visible for str.strip(unicode) */
11485PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011486_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 void *data;
11489 int kind;
11490 Py_ssize_t i, j, len;
11491 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11494 return NULL;
11495
11496 kind = PyUnicode_KIND(self);
11497 data = PyUnicode_DATA(self);
11498 len = PyUnicode_GET_LENGTH(self);
11499 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11500 PyUnicode_DATA(sepobj),
11501 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011502
Benjamin Peterson14339b62009-01-31 16:36:08 +000011503 i = 0;
11504 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 while (i < len &&
11506 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 i++;
11508 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011509 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011510
Benjamin Peterson14339b62009-01-31 16:36:08 +000011511 j = len;
11512 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011513 do {
11514 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 } while (j >= i &&
11516 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011517 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011518 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011519
Victor Stinner7931d9a2011-11-04 00:22:48 +010011520 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521}
11522
11523PyObject*
11524PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11525{
11526 unsigned char *data;
11527 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011528 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529
Victor Stinnerde636f32011-10-01 03:55:54 +020011530 if (PyUnicode_READY(self) == -1)
11531 return NULL;
11532
Victor Stinner684d5fd2012-05-03 02:32:34 +020011533 length = PyUnicode_GET_LENGTH(self);
11534 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011535
Victor Stinner684d5fd2012-05-03 02:32:34 +020011536 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011537 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538
Victor Stinnerde636f32011-10-01 03:55:54 +020011539 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011540 PyErr_SetString(PyExc_IndexError, "string index out of range");
11541 return NULL;
11542 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011543 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011544 Py_INCREF(unicode_empty);
11545 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011546 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011547
Victor Stinner684d5fd2012-05-03 02:32:34 +020011548 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011549 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011550 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011551 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011552 }
11553 else {
11554 kind = PyUnicode_KIND(self);
11555 data = PyUnicode_1BYTE_DATA(self);
11556 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011557 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011558 length);
11559 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561
11562static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011563do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 int kind;
11566 void *data;
11567 Py_ssize_t len, i, j;
11568
11569 if (PyUnicode_READY(self) == -1)
11570 return NULL;
11571
11572 kind = PyUnicode_KIND(self);
11573 data = PyUnicode_DATA(self);
11574 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011575
Benjamin Peterson14339b62009-01-31 16:36:08 +000011576 i = 0;
11577 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011579 i++;
11580 }
11581 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011582
Benjamin Peterson14339b62009-01-31 16:36:08 +000011583 j = len;
11584 if (striptype != LEFTSTRIP) {
11585 do {
11586 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011588 j++;
11589 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011590
Victor Stinner7931d9a2011-11-04 00:22:48 +010011591 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592}
11593
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011594
11595static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011596do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011597{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011598 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011599
Benjamin Peterson14339b62009-01-31 16:36:08 +000011600 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11601 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011602
Benjamin Peterson14339b62009-01-31 16:36:08 +000011603 if (sep != NULL && sep != Py_None) {
11604 if (PyUnicode_Check(sep))
11605 return _PyUnicode_XStrip(self, striptype, sep);
11606 else {
11607 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011608 "%s arg must be None or str",
11609 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011610 return NULL;
11611 }
11612 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011613
Benjamin Peterson14339b62009-01-31 16:36:08 +000011614 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011615}
11616
11617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011618PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011620\n\
11621Return a copy of the string S with leading and trailing\n\
11622whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011623If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011624
11625static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011626unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011627{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011628 if (PyTuple_GET_SIZE(args) == 0)
11629 return do_strip(self, BOTHSTRIP); /* Common case */
11630 else
11631 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011632}
11633
11634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011635PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011637\n\
11638Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011639If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011640
11641static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011642unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011643{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011644 if (PyTuple_GET_SIZE(args) == 0)
11645 return do_strip(self, LEFTSTRIP); /* Common case */
11646 else
11647 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011648}
11649
11650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011651PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011652 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011653\n\
11654Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011655If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011656
11657static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011658unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011659{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011660 if (PyTuple_GET_SIZE(args) == 0)
11661 return do_strip(self, RIGHTSTRIP); /* Common case */
11662 else
11663 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011664}
11665
11666
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011668unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011670 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672
Georg Brandl222de0f2009-04-12 12:01:50 +000011673 if (len < 1) {
11674 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011675 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677
Victor Stinnerc4b49542011-12-11 22:44:26 +010011678 /* no repeat, return original string */
11679 if (len == 1)
11680 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011681
Benjamin Petersonbac79492012-01-14 13:34:47 -050011682 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 return NULL;
11684
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011685 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011686 PyErr_SetString(PyExc_OverflowError,
11687 "repeated string is too long");
11688 return NULL;
11689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011691
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011692 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693 if (!u)
11694 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011695 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 if (PyUnicode_GET_LENGTH(str) == 1) {
11698 const int kind = PyUnicode_KIND(str);
11699 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011700 if (kind == PyUnicode_1BYTE_KIND) {
11701 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011702 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011703 }
11704 else if (kind == PyUnicode_2BYTE_KIND) {
11705 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011706 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011707 ucs2[n] = fill_char;
11708 } else {
11709 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11710 assert(kind == PyUnicode_4BYTE_KIND);
11711 for (n = 0; n < len; ++n)
11712 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011713 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 }
11715 else {
11716 /* number of characters copied this far */
11717 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011718 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 char *to = (char *) PyUnicode_DATA(u);
11720 Py_MEMCPY(to, PyUnicode_DATA(str),
11721 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011722 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 n = (done <= nchars-done) ? done : nchars-done;
11724 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011725 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727 }
11728
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011729 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011730 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731}
11732
Alexander Belopolsky40018472011-02-26 01:02:56 +000011733PyObject *
11734PyUnicode_Replace(PyObject *obj,
11735 PyObject *subobj,
11736 PyObject *replobj,
11737 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738{
11739 PyObject *self;
11740 PyObject *str1;
11741 PyObject *str2;
11742 PyObject *result;
11743
11744 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011745 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011748 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011749 Py_DECREF(self);
11750 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751 }
11752 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011753 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011754 Py_DECREF(self);
11755 Py_DECREF(str1);
11756 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011758 if (PyUnicode_READY(self) == -1 ||
11759 PyUnicode_READY(str1) == -1 ||
11760 PyUnicode_READY(str2) == -1)
11761 result = NULL;
11762 else
11763 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764 Py_DECREF(self);
11765 Py_DECREF(str1);
11766 Py_DECREF(str2);
11767 return result;
11768}
11769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011770PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011771 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772\n\
11773Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011774old replaced by new. If the optional argument count is\n\
11775given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776
11777static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 PyObject *str1;
11781 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011782 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783 PyObject *result;
11784
Martin v. Löwis18e16552006-02-15 17:27:45 +000011785 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011787 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011790 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 return NULL;
11792 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011793 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011794 Py_DECREF(str1);
11795 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011796 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011797 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11798 result = NULL;
11799 else
11800 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801
11802 Py_DECREF(str1);
11803 Py_DECREF(str2);
11804 return result;
11805}
11806
Alexander Belopolsky40018472011-02-26 01:02:56 +000011807static PyObject *
11808unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011810 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 Py_ssize_t isize;
11812 Py_ssize_t osize, squote, dquote, i, o;
11813 Py_UCS4 max, quote;
11814 int ikind, okind;
11815 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011818 return NULL;
11819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 isize = PyUnicode_GET_LENGTH(unicode);
11821 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 /* Compute length of output, quote characters, and
11824 maximum character */
11825 osize = 2; /* quotes */
11826 max = 127;
11827 squote = dquote = 0;
11828 ikind = PyUnicode_KIND(unicode);
11829 for (i = 0; i < isize; i++) {
11830 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11831 switch (ch) {
11832 case '\'': squote++; osize++; break;
11833 case '"': dquote++; osize++; break;
11834 case '\\': case '\t': case '\r': case '\n':
11835 osize += 2; break;
11836 default:
11837 /* Fast-path ASCII */
11838 if (ch < ' ' || ch == 0x7f)
11839 osize += 4; /* \xHH */
11840 else if (ch < 0x7f)
11841 osize++;
11842 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11843 osize++;
11844 max = ch > max ? ch : max;
11845 }
11846 else if (ch < 0x100)
11847 osize += 4; /* \xHH */
11848 else if (ch < 0x10000)
11849 osize += 6; /* \uHHHH */
11850 else
11851 osize += 10; /* \uHHHHHHHH */
11852 }
11853 }
11854
11855 quote = '\'';
11856 if (squote) {
11857 if (dquote)
11858 /* Both squote and dquote present. Use squote,
11859 and escape them */
11860 osize += squote;
11861 else
11862 quote = '"';
11863 }
11864
11865 repr = PyUnicode_New(osize, max);
11866 if (repr == NULL)
11867 return NULL;
11868 okind = PyUnicode_KIND(repr);
11869 odata = PyUnicode_DATA(repr);
11870
11871 PyUnicode_WRITE(okind, odata, 0, quote);
11872 PyUnicode_WRITE(okind, odata, osize-1, quote);
11873
11874 for (i = 0, o = 1; i < isize; i++) {
11875 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011876
11877 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 if ((ch == quote) || (ch == '\\')) {
11879 PyUnicode_WRITE(okind, odata, o++, '\\');
11880 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011881 continue;
11882 }
11883
Benjamin Peterson29060642009-01-31 22:14:21 +000011884 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011885 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 PyUnicode_WRITE(okind, odata, o++, '\\');
11887 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011888 }
11889 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 PyUnicode_WRITE(okind, odata, o++, '\\');
11891 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011892 }
11893 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 PyUnicode_WRITE(okind, odata, o++, '\\');
11895 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011896 }
11897
11898 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011899 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 PyUnicode_WRITE(okind, odata, o++, '\\');
11901 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011902 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11903 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011904 }
11905
Georg Brandl559e5d72008-06-11 18:37:52 +000011906 /* Copy ASCII characters as-is */
11907 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011909 }
11910
Benjamin Peterson29060642009-01-31 22:14:21 +000011911 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011912 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011913 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011914 (categories Z* and C* except ASCII space)
11915 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011917 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000011918 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011921 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11922 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011923 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011924 /* Map 16-bit characters to '\uxxxx' */
11925 else if (ch <= 0xffff) {
11926 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011927 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11928 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11929 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11930 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011931 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011932 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011933 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011934 PyUnicode_WRITE(okind, odata, o++, 'U');
11935 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11936 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11937 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11938 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020011939 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11940 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11941 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11942 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011943 }
11944 }
11945 /* Copy characters as-is */
11946 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011948 }
11949 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011952 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011953 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954}
11955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011956PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011957 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958\n\
11959Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011960such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961arguments start and end are interpreted as in slice notation.\n\
11962\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011963Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964
11965static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011968 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011969 Py_ssize_t start;
11970 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011971 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972
Jesus Ceaac451502011-04-20 17:09:23 +020011973 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11974 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011975 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 if (PyUnicode_READY(self) == -1)
11978 return NULL;
11979 if (PyUnicode_READY(substring) == -1)
11980 return NULL;
11981
Victor Stinner7931d9a2011-11-04 00:22:48 +010011982 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983
11984 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 if (result == -2)
11987 return NULL;
11988
Christian Heimes217cfd12007-12-02 14:31:20 +000011989 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990}
11991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011992PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011993 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011995Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996
11997static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012000 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012001 Py_ssize_t start;
12002 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012003 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004
Jesus Ceaac451502011-04-20 17:09:23 +020012005 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12006 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012007 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 if (PyUnicode_READY(self) == -1)
12010 return NULL;
12011 if (PyUnicode_READY(substring) == -1)
12012 return NULL;
12013
Victor Stinner7931d9a2011-11-04 00:22:48 +010012014 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015
12016 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 if (result == -2)
12019 return NULL;
12020
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021 if (result < 0) {
12022 PyErr_SetString(PyExc_ValueError, "substring not found");
12023 return NULL;
12024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025
Christian Heimes217cfd12007-12-02 14:31:20 +000012026 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027}
12028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012029PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012032Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012033done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034
12035static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012036unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012038 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 Py_UCS4 fillchar = ' ';
12040
Victor Stinnere9a29352011-10-01 02:14:59 +020012041 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012043
Benjamin Petersonbac79492012-01-14 13:34:47 -050012044 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045 return NULL;
12046
Victor Stinnerc4b49542011-12-11 22:44:26 +010012047 if (PyUnicode_GET_LENGTH(self) >= width)
12048 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049
Victor Stinnerc4b49542011-12-11 22:44:26 +010012050 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051}
12052
Alexander Belopolsky40018472011-02-26 01:02:56 +000012053PyObject *
12054PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055{
12056 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012057
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058 s = PyUnicode_FromObject(s);
12059 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012060 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012061 if (sep != NULL) {
12062 sep = PyUnicode_FromObject(sep);
12063 if (sep == NULL) {
12064 Py_DECREF(s);
12065 return NULL;
12066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067 }
12068
Victor Stinner9310abb2011-10-05 00:59:23 +020012069 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070
12071 Py_DECREF(s);
12072 Py_XDECREF(sep);
12073 return result;
12074}
12075
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012076PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012077 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078\n\
12079Return a list of the words in S, using sep as the\n\
12080delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012081splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012082whitespace string is a separator and empty strings are\n\
12083removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084
12085static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012086unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012088 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012090 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012092 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12093 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094 return NULL;
12095
12096 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012097 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012099 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012101 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102}
12103
Thomas Wouters477c8d52006-05-27 19:21:47 +000012104PyObject *
12105PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12106{
12107 PyObject* str_obj;
12108 PyObject* sep_obj;
12109 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 int kind1, kind2, kind;
12111 void *buf1 = NULL, *buf2 = NULL;
12112 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012113
12114 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012115 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012117 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012118 if (!sep_obj) {
12119 Py_DECREF(str_obj);
12120 return NULL;
12121 }
12122 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12123 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012124 Py_DECREF(str_obj);
12125 return NULL;
12126 }
12127
Victor Stinner14f8f022011-10-05 20:58:25 +020012128 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012130 kind = Py_MAX(kind1, kind2);
12131 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012133 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 if (!buf1)
12135 goto onError;
12136 buf2 = PyUnicode_DATA(sep_obj);
12137 if (kind2 != kind)
12138 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12139 if (!buf2)
12140 goto onError;
12141 len1 = PyUnicode_GET_LENGTH(str_obj);
12142 len2 = PyUnicode_GET_LENGTH(sep_obj);
12143
Benjamin Petersonead6b532011-12-20 17:23:42 -060012144 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012146 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12147 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12148 else
12149 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 break;
12151 case PyUnicode_2BYTE_KIND:
12152 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12153 break;
12154 case PyUnicode_4BYTE_KIND:
12155 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12156 break;
12157 default:
12158 assert(0);
12159 out = 0;
12160 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012161
12162 Py_DECREF(sep_obj);
12163 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 if (kind1 != kind)
12165 PyMem_Free(buf1);
12166 if (kind2 != kind)
12167 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012168
12169 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 onError:
12171 Py_DECREF(sep_obj);
12172 Py_DECREF(str_obj);
12173 if (kind1 != kind && buf1)
12174 PyMem_Free(buf1);
12175 if (kind2 != kind && buf2)
12176 PyMem_Free(buf2);
12177 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012178}
12179
12180
12181PyObject *
12182PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12183{
12184 PyObject* str_obj;
12185 PyObject* sep_obj;
12186 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012187 int kind1, kind2, kind;
12188 void *buf1 = NULL, *buf2 = NULL;
12189 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012190
12191 str_obj = PyUnicode_FromObject(str_in);
12192 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012193 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012194 sep_obj = PyUnicode_FromObject(sep_in);
12195 if (!sep_obj) {
12196 Py_DECREF(str_obj);
12197 return NULL;
12198 }
12199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 kind1 = PyUnicode_KIND(str_in);
12201 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012202 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 buf1 = PyUnicode_DATA(str_in);
12204 if (kind1 != kind)
12205 buf1 = _PyUnicode_AsKind(str_in, kind);
12206 if (!buf1)
12207 goto onError;
12208 buf2 = PyUnicode_DATA(sep_obj);
12209 if (kind2 != kind)
12210 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12211 if (!buf2)
12212 goto onError;
12213 len1 = PyUnicode_GET_LENGTH(str_obj);
12214 len2 = PyUnicode_GET_LENGTH(sep_obj);
12215
Benjamin Petersonead6b532011-12-20 17:23:42 -060012216 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012218 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12219 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12220 else
12221 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 break;
12223 case PyUnicode_2BYTE_KIND:
12224 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12225 break;
12226 case PyUnicode_4BYTE_KIND:
12227 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12228 break;
12229 default:
12230 assert(0);
12231 out = 0;
12232 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012233
12234 Py_DECREF(sep_obj);
12235 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 if (kind1 != kind)
12237 PyMem_Free(buf1);
12238 if (kind2 != kind)
12239 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012240
12241 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 onError:
12243 Py_DECREF(sep_obj);
12244 Py_DECREF(str_obj);
12245 if (kind1 != kind && buf1)
12246 PyMem_Free(buf1);
12247 if (kind2 != kind && buf2)
12248 PyMem_Free(buf2);
12249 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012250}
12251
12252PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012254\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012255Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012256the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012257found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012258
12259static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012260unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012261{
Victor Stinner9310abb2011-10-05 00:59:23 +020012262 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012263}
12264
12265PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012266 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012267\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012268Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012269the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012270separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012271
12272static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012273unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012274{
Victor Stinner9310abb2011-10-05 00:59:23 +020012275 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012276}
12277
Alexander Belopolsky40018472011-02-26 01:02:56 +000012278PyObject *
12279PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012280{
12281 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012282
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012283 s = PyUnicode_FromObject(s);
12284 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012285 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 if (sep != NULL) {
12287 sep = PyUnicode_FromObject(sep);
12288 if (sep == NULL) {
12289 Py_DECREF(s);
12290 return NULL;
12291 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012292 }
12293
Victor Stinner9310abb2011-10-05 00:59:23 +020012294 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012295
12296 Py_DECREF(s);
12297 Py_XDECREF(sep);
12298 return result;
12299}
12300
12301PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012302 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012303\n\
12304Return a list of the words in S, using sep as the\n\
12305delimiter string, starting at the end of the string and\n\
12306working to the front. If maxsplit is given, at most maxsplit\n\
12307splits are done. If sep is not specified, any whitespace string\n\
12308is a separator.");
12309
12310static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012311unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012312{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012313 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012314 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012315 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012316
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012317 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12318 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012319 return NULL;
12320
12321 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012323 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012324 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012325 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012326 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012327}
12328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012329PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331\n\
12332Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012333Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012334is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335
12336static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012337unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012339 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012340 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012342 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12343 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012344 return NULL;
12345
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012346 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347}
12348
12349static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012350PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012352 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353}
12354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012355PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012356 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357\n\
12358Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012359and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360
12361static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012362unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012364 if (PyUnicode_READY(self) == -1)
12365 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012366 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012367}
12368
Georg Brandlceee0772007-11-27 23:48:05 +000012369PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012370 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012371\n\
12372Return a translation table usable for str.translate().\n\
12373If there is only one argument, it must be a dictionary mapping Unicode\n\
12374ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012375Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012376If there are two arguments, they must be strings of equal length, and\n\
12377in the resulting dictionary, each character in x will be mapped to the\n\
12378character at the same position in y. If there is a third argument, it\n\
12379must be a string, whose characters will be mapped to None in the result.");
12380
12381static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012382unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012383{
12384 PyObject *x, *y = NULL, *z = NULL;
12385 PyObject *new = NULL, *key, *value;
12386 Py_ssize_t i = 0;
12387 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012388
Georg Brandlceee0772007-11-27 23:48:05 +000012389 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12390 return NULL;
12391 new = PyDict_New();
12392 if (!new)
12393 return NULL;
12394 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012395 int x_kind, y_kind, z_kind;
12396 void *x_data, *y_data, *z_data;
12397
Georg Brandlceee0772007-11-27 23:48:05 +000012398 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012399 if (!PyUnicode_Check(x)) {
12400 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12401 "be a string if there is a second argument");
12402 goto err;
12403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012405 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12406 "arguments must have equal length");
12407 goto err;
12408 }
12409 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 x_kind = PyUnicode_KIND(x);
12411 y_kind = PyUnicode_KIND(y);
12412 x_data = PyUnicode_DATA(x);
12413 y_data = PyUnicode_DATA(y);
12414 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12415 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012416 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012417 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012418 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012419 if (!value) {
12420 Py_DECREF(key);
12421 goto err;
12422 }
Georg Brandlceee0772007-11-27 23:48:05 +000012423 res = PyDict_SetItem(new, key, value);
12424 Py_DECREF(key);
12425 Py_DECREF(value);
12426 if (res < 0)
12427 goto err;
12428 }
12429 /* create entries for deleting chars in z */
12430 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 z_kind = PyUnicode_KIND(z);
12432 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012433 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012435 if (!key)
12436 goto err;
12437 res = PyDict_SetItem(new, key, Py_None);
12438 Py_DECREF(key);
12439 if (res < 0)
12440 goto err;
12441 }
12442 }
12443 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 int kind;
12445 void *data;
12446
Georg Brandlceee0772007-11-27 23:48:05 +000012447 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012448 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012449 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12450 "to maketrans it must be a dict");
12451 goto err;
12452 }
12453 /* copy entries into the new dict, converting string keys to int keys */
12454 while (PyDict_Next(x, &i, &key, &value)) {
12455 if (PyUnicode_Check(key)) {
12456 /* convert string keys to integer keys */
12457 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012458 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012459 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12460 "table must be of length 1");
12461 goto err;
12462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 kind = PyUnicode_KIND(key);
12464 data = PyUnicode_DATA(key);
12465 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012466 if (!newkey)
12467 goto err;
12468 res = PyDict_SetItem(new, newkey, value);
12469 Py_DECREF(newkey);
12470 if (res < 0)
12471 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012472 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012473 /* just keep integer keys */
12474 if (PyDict_SetItem(new, key, value) < 0)
12475 goto err;
12476 } else {
12477 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12478 "be strings or integers");
12479 goto err;
12480 }
12481 }
12482 }
12483 return new;
12484 err:
12485 Py_DECREF(new);
12486 return NULL;
12487}
12488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012489PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012490 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491\n\
12492Return a copy of the string S, where all characters have been mapped\n\
12493through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012494Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012495Unmapped characters are left untouched. Characters mapped to None\n\
12496are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497
12498static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012501 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502}
12503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012504PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012505 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012507Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508
12509static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012510unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012512 if (PyUnicode_READY(self) == -1)
12513 return NULL;
12514 if (PyUnicode_IS_ASCII(self))
12515 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012516 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517}
12518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012519PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012520 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012522Pad a numeric string S with zeros on the left, to fill a field\n\
12523of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524
12525static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012526unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012528 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012529 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012530 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 int kind;
12532 void *data;
12533 Py_UCS4 chr;
12534
Martin v. Löwis18e16552006-02-15 17:27:45 +000012535 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536 return NULL;
12537
Benjamin Petersonbac79492012-01-14 13:34:47 -050012538 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540
Victor Stinnerc4b49542011-12-11 22:44:26 +010012541 if (PyUnicode_GET_LENGTH(self) >= width)
12542 return unicode_result_unchanged(self);
12543
12544 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545
12546 u = pad(self, fill, 0, '0');
12547
Walter Dörwald068325e2002-04-15 13:36:47 +000012548 if (u == NULL)
12549 return NULL;
12550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 kind = PyUnicode_KIND(u);
12552 data = PyUnicode_DATA(u);
12553 chr = PyUnicode_READ(kind, data, fill);
12554
12555 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 PyUnicode_WRITE(kind, data, 0, chr);
12558 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559 }
12560
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012561 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012562 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564
#if 0
/* Compiled out.  Wrapper around PyUnicode_TransformDecimalAndSpaceToASCII();
   its (also disabled) method-table entry describes it as used only for
   debugging the implementation. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012573PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012574 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012576Return True if S starts with the specified prefix, False otherwise.\n\
12577With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012578With optional end, stop comparing S at that position.\n\
12579prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580
12581static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012582unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012583 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012585 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012586 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012587 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012588 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012589 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590
Jesus Ceaac451502011-04-20 17:09:23 +020012591 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012593 if (PyTuple_Check(subobj)) {
12594 Py_ssize_t i;
12595 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012596 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012597 if (substring == NULL)
12598 return NULL;
12599 result = tailmatch(self, substring, start, end, -1);
12600 Py_DECREF(substring);
12601 if (result) {
12602 Py_RETURN_TRUE;
12603 }
12604 }
12605 /* nothing matched */
12606 Py_RETURN_FALSE;
12607 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012608 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012609 if (substring == NULL) {
12610 if (PyErr_ExceptionMatches(PyExc_TypeError))
12611 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12612 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012613 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012614 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012615 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012617 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618}
12619
12620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012621PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012622 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012624Return True if S ends with the specified suffix, False otherwise.\n\
12625With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012626With optional end, stop comparing S at that position.\n\
12627suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628
12629static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012630unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012631 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012633 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012634 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012635 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012636 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012637 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638
Jesus Ceaac451502011-04-20 17:09:23 +020012639 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012641 if (PyTuple_Check(subobj)) {
12642 Py_ssize_t i;
12643 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012644 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012645 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012646 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012647 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012648 result = tailmatch(self, substring, start, end, +1);
12649 Py_DECREF(substring);
12650 if (result) {
12651 Py_RETURN_TRUE;
12652 }
12653 }
12654 Py_RETURN_FALSE;
12655 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012656 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012657 if (substring == NULL) {
12658 if (PyErr_ExceptionMatches(PyExc_TypeError))
12659 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12660 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012661 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012662 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012663 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012665 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666}
12667
Victor Stinner202fdca2012-05-07 12:47:02 +020012668Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012669_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012670{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012671 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012672 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12673 writer->data = PyUnicode_DATA(writer->buffer);
12674 writer->kind = PyUnicode_KIND(writer->buffer);
12675}
12676
Victor Stinnerd3f08822012-05-29 12:57:52 +020012677void
12678_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012679{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012680 memset(writer, 0, sizeof(*writer));
12681#ifdef Py_DEBUG
12682 writer->kind = 5; /* invalid kind */
12683#endif
12684 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012685 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012686}
12687
Victor Stinnerd3f08822012-05-29 12:57:52 +020012688int
12689_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12690 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012691{
12692 Py_ssize_t newlen;
12693 PyObject *newbuffer;
12694
Victor Stinnerd3f08822012-05-29 12:57:52 +020012695 assert(length > 0);
12696
Victor Stinner202fdca2012-05-07 12:47:02 +020012697 if (length > PY_SSIZE_T_MAX - writer->pos) {
12698 PyErr_NoMemory();
12699 return -1;
12700 }
12701 newlen = writer->pos + length;
12702
Victor Stinnerd3f08822012-05-29 12:57:52 +020012703 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012704 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012705 /* overallocate 25% to limit the number of resize */
12706 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12707 newlen += newlen / 4;
12708 if (newlen < writer->min_length)
12709 newlen = writer->min_length;
12710 }
12711 writer->buffer = PyUnicode_New(newlen, maxchar);
12712 if (writer->buffer == NULL)
12713 return -1;
12714 _PyUnicodeWriter_Update(writer);
12715 return 0;
12716 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012717
Victor Stinnerd3f08822012-05-29 12:57:52 +020012718 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012719 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012720 /* overallocate 25% to limit the number of resize */
12721 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12722 newlen += newlen / 4;
12723 if (newlen < writer->min_length)
12724 newlen = writer->min_length;
12725 }
12726
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012727 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012728 /* resize + widen */
12729 newbuffer = PyUnicode_New(newlen, maxchar);
12730 if (newbuffer == NULL)
12731 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012732 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12733 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012734 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012735 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012736 }
12737 else {
12738 newbuffer = resize_compact(writer->buffer, newlen);
12739 if (newbuffer == NULL)
12740 return -1;
12741 }
12742 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012743 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012744 }
12745 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012746 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012747 newbuffer = PyUnicode_New(writer->size, maxchar);
12748 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012749 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012750 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12751 writer->buffer, 0, writer->pos);
12752 Py_DECREF(writer->buffer);
12753 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012754 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012755 }
12756 return 0;
12757}
12758
Victor Stinnerd3f08822012-05-29 12:57:52 +020012759int
12760_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12761{
12762 Py_UCS4 maxchar;
12763 Py_ssize_t len;
12764
12765 if (PyUnicode_READY(str) == -1)
12766 return -1;
12767 len = PyUnicode_GET_LENGTH(str);
12768 if (len == 0)
12769 return 0;
12770 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12771 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012772 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012773 Py_INCREF(str);
12774 writer->buffer = str;
12775 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012776 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012777 writer->size = 0;
12778 writer->pos += len;
12779 return 0;
12780 }
12781 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12782 return -1;
12783 }
12784 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12785 str, 0, len);
12786 writer->pos += len;
12787 return 0;
12788}
12789
Victor Stinnere215d962012-10-06 23:03:36 +020012790int
12791_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12792{
12793 Py_UCS4 maxchar;
12794
12795 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12796 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12797 return -1;
12798 unicode_write_cstr(writer->buffer, writer->pos, str, len);
12799 writer->pos += len;
12800 return 0;
12801}
12802
Victor Stinnerd3f08822012-05-29 12:57:52 +020012803PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012804_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012805{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012806 if (writer->pos == 0) {
12807 Py_XDECREF(writer->buffer);
12808 Py_INCREF(unicode_empty);
12809 return unicode_empty;
12810 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012811 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012812 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12813 return writer->buffer;
12814 }
12815 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12816 PyObject *newbuffer;
12817 newbuffer = resize_compact(writer->buffer, writer->pos);
12818 if (newbuffer == NULL) {
12819 Py_DECREF(writer->buffer);
12820 return NULL;
12821 }
12822 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012823 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012824 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012825 return writer->buffer;
12826}
12827
Victor Stinnerd3f08822012-05-29 12:57:52 +020012828void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012829_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012830{
12831 Py_CLEAR(writer->buffer);
12832}
12833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012835
12836PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012837 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012838\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012839Return a formatted version of S, using substitutions from args and kwargs.\n\
12840The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012841
Eric Smith27bbca62010-11-04 17:06:58 +000012842PyDoc_STRVAR(format_map__doc__,
12843 "S.format_map(mapping) -> str\n\
12844\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012845Return a formatted version of S, using substitutions from mapping.\n\
12846The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012847
Eric Smith4a7d76d2008-05-30 18:10:19 +000012848static PyObject *
12849unicode__format__(PyObject* self, PyObject* args)
12850{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012851 PyObject *format_spec;
12852 _PyUnicodeWriter writer;
12853 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012854
12855 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12856 return NULL;
12857
Victor Stinnerd3f08822012-05-29 12:57:52 +020012858 if (PyUnicode_READY(self) == -1)
12859 return NULL;
12860 _PyUnicodeWriter_Init(&writer, 0);
12861 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12862 self, format_spec, 0,
12863 PyUnicode_GET_LENGTH(format_spec));
12864 if (ret == -1) {
12865 _PyUnicodeWriter_Dealloc(&writer);
12866 return NULL;
12867 }
12868 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012869}
12870
Eric Smith8c663262007-08-25 02:26:07 +000012871PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012873\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012874Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012875
12876static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012877unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012878{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012879 Py_ssize_t size;
12880
12881 /* If it's a compact object, account for base structure +
12882 character data. */
12883 if (PyUnicode_IS_COMPACT_ASCII(v))
12884 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12885 else if (PyUnicode_IS_COMPACT(v))
12886 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012887 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012888 else {
12889 /* If it is a two-block object, account for base object, and
12890 for character block if present. */
12891 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012892 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012894 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012895 }
12896 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012897 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012898 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012900 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012901 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012902
12903 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012904}
12905
12906PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012907 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012908
12909static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012910unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012911{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012912 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012913 if (!copy)
12914 return NULL;
12915 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012916}
12917
Guido van Rossumd57fd912000-03-10 22:53:23 +000012918static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012919 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012920 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012921 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
12922 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012923 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12924 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050012925 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012926 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12927 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12928 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12929 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12930 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012931 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012932 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12933 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12934 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012935 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012936 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12937 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12938 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012939 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012940 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012941 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012942 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012943 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12944 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12945 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12946 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12947 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12948 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12949 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12950 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12951 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12952 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12953 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12954 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12955 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12956 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012957 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012958 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012959 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012960 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012961 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012962 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012963 {"maketrans", (PyCFunction) unicode_maketrans,
12964 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012965 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012966#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012967 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012968 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012969#endif
12970
Benjamin Peterson14339b62009-01-31 16:36:08 +000012971 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012972 {NULL, NULL}
12973};
12974
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012975static PyObject *
12976unicode_mod(PyObject *v, PyObject *w)
12977{
Brian Curtindfc80e32011-08-10 20:28:54 -050012978 if (!PyUnicode_Check(v))
12979 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012980 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012981}
12982
12983static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012984 0, /*nb_add*/
12985 0, /*nb_subtract*/
12986 0, /*nb_multiply*/
12987 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012988};
12989
Guido van Rossumd57fd912000-03-10 22:53:23 +000012990static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012991 (lenfunc) unicode_length, /* sq_length */
12992 PyUnicode_Concat, /* sq_concat */
12993 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12994 (ssizeargfunc) unicode_getitem, /* sq_item */
12995 0, /* sq_slice */
12996 0, /* sq_ass_item */
12997 0, /* sq_ass_slice */
12998 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012999};
13000
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013001static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013002unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013003{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 if (PyUnicode_READY(self) == -1)
13005 return NULL;
13006
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013007 if (PyIndex_Check(item)) {
13008 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013009 if (i == -1 && PyErr_Occurred())
13010 return NULL;
13011 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013013 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013014 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013015 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013016 PyObject *result;
13017 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013018 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013019 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013022 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013023 return NULL;
13024 }
13025
13026 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013027 Py_INCREF(unicode_empty);
13028 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013029 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013030 slicelength == PyUnicode_GET_LENGTH(self)) {
13031 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013032 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013033 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013034 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013035 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013036 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013037 src_kind = PyUnicode_KIND(self);
13038 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013039 if (!PyUnicode_IS_ASCII(self)) {
13040 kind_limit = kind_maxchar_limit(src_kind);
13041 max_char = 0;
13042 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13043 ch = PyUnicode_READ(src_kind, src_data, cur);
13044 if (ch > max_char) {
13045 max_char = ch;
13046 if (max_char >= kind_limit)
13047 break;
13048 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013049 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013050 }
Victor Stinner55c99112011-10-13 01:17:06 +020013051 else
13052 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013053 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013054 if (result == NULL)
13055 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013056 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013057 dest_data = PyUnicode_DATA(result);
13058
13059 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013060 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13061 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013062 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013063 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013064 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013065 } else {
13066 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13067 return NULL;
13068 }
13069}
13070
13071static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013072 (lenfunc)unicode_length, /* mp_length */
13073 (binaryfunc)unicode_subscript, /* mp_subscript */
13074 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013075};
13076
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078/* Helpers for PyUnicode_Format() */
13079
Victor Stinnera47082312012-10-04 02:19:54 +020013080struct unicode_formatter_t {
13081 PyObject *args;
13082 int args_owned;
13083 Py_ssize_t arglen, argidx;
13084 PyObject *dict;
13085
13086 enum PyUnicode_Kind fmtkind;
13087 Py_ssize_t fmtcnt, fmtpos;
13088 void *fmtdata;
13089 PyObject *fmtstr;
13090
13091 _PyUnicodeWriter writer;
13092};
13093
13094struct unicode_format_arg_t {
13095 Py_UCS4 ch;
13096 int flags;
13097 Py_ssize_t width;
13098 int prec;
13099 int sign;
13100};
13101
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013103unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104{
Victor Stinnera47082312012-10-04 02:19:54 +020013105 Py_ssize_t argidx = ctx->argidx;
13106
13107 if (argidx < ctx->arglen) {
13108 ctx->argidx++;
13109 if (ctx->arglen < 0)
13110 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013111 else
Victor Stinnera47082312012-10-04 02:19:54 +020013112 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113 }
13114 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013115 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116 return NULL;
13117}
13118
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013119/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120
Victor Stinnera47082312012-10-04 02:19:54 +020013121/* Format a float into the writer if the writer is not NULL, or into *p_output
13122 otherwise.
13123
13124 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013125static int
Victor Stinnera47082312012-10-04 02:19:54 +020013126formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13127 PyObject **p_output,
13128 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013130 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013132 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013133 int prec;
13134 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013135
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136 x = PyFloat_AsDouble(v);
13137 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013138 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013139
Victor Stinnera47082312012-10-04 02:19:54 +020013140 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013142 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013143
Victor Stinnera47082312012-10-04 02:19:54 +020013144 if (arg->flags & F_ALT)
13145 dtoa_flags = Py_DTSF_ALT;
13146 else
13147 dtoa_flags = 0;
13148 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013149 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013150 return -1;
13151 len = strlen(p);
13152 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013153 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13154 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013155 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013156 }
Victor Stinner184252a2012-06-16 02:57:41 +020013157 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013158 writer->pos += len;
13159 }
13160 else
13161 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013162 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013163 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013164}
13165
Victor Stinnerd0880d52012-04-27 23:40:13 +020013166/* formatlong() emulates the format codes d, u, o, x and X, and
13167 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13168 * Python's regular ints.
13169 * Return value: a new PyUnicodeObject*, or NULL if error.
13170 * The output string is of the form
13171 * "-"? ("0x" | "0X")? digit+
13172 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13173 * set in flags. The case of hex digits will be correct,
13174 * There will be at least prec digits, zero-filled on the left if
13175 * necessary to get that many.
13176 * val object to be converted
13177 * flags bitmask of format flags; only F_ALT is looked at
13178 * prec minimum number of digits; 0-fill on left if needed
13179 * type a character in [duoxX]; u acts the same as d
13180 *
13181 * CAUTION: o, x and X conversions on regular ints can never
13182 * produce a '-' sign, but can for Python's unbounded ints.
13183 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013184static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013185formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013186{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013187 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013188 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013189 Py_ssize_t i;
13190 int sign; /* 1 if '-', else 0 */
13191 int len; /* number of characters */
13192 Py_ssize_t llen;
13193 int numdigits; /* len == numnondigits + numdigits */
13194 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013195 int prec = arg->prec;
13196 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013197
Victor Stinnerd0880d52012-04-27 23:40:13 +020013198 /* Avoid exceeding SSIZE_T_MAX */
13199 if (prec > INT_MAX-3) {
13200 PyErr_SetString(PyExc_OverflowError,
13201 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013202 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013203 }
13204
13205 assert(PyLong_Check(val));
13206
13207 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013208 default:
13209 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013210 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013211 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013212 case 'u':
13213 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013214 if (PyBool_Check(val))
13215 result = PyNumber_ToBase(val, 10);
13216 else
13217 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013218 break;
13219 case 'o':
13220 numnondigits = 2;
13221 result = PyNumber_ToBase(val, 8);
13222 break;
13223 case 'x':
13224 case 'X':
13225 numnondigits = 2;
13226 result = PyNumber_ToBase(val, 16);
13227 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013228 }
13229 if (!result)
13230 return NULL;
13231
13232 assert(unicode_modifiable(result));
13233 assert(PyUnicode_IS_READY(result));
13234 assert(PyUnicode_IS_ASCII(result));
13235
13236 /* To modify the string in-place, there can only be one reference. */
13237 if (Py_REFCNT(result) != 1) {
13238 PyErr_BadInternalCall();
13239 return NULL;
13240 }
13241 buf = PyUnicode_DATA(result);
13242 llen = PyUnicode_GET_LENGTH(result);
13243 if (llen > INT_MAX) {
13244 PyErr_SetString(PyExc_ValueError,
13245 "string too large in _PyBytes_FormatLong");
13246 return NULL;
13247 }
13248 len = (int)llen;
13249 sign = buf[0] == '-';
13250 numnondigits += sign;
13251 numdigits = len - numnondigits;
13252 assert(numdigits > 0);
13253
13254 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013255 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013256 (type == 'o' || type == 'x' || type == 'X'))) {
13257 assert(buf[sign] == '0');
13258 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13259 buf[sign+1] == 'o');
13260 numnondigits -= 2;
13261 buf += 2;
13262 len -= 2;
13263 if (sign)
13264 buf[0] = '-';
13265 assert(len == numnondigits + numdigits);
13266 assert(numdigits > 0);
13267 }
13268
13269 /* Fill with leading zeroes to meet minimum width. */
13270 if (prec > numdigits) {
13271 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13272 numnondigits + prec);
13273 char *b1;
13274 if (!r1) {
13275 Py_DECREF(result);
13276 return NULL;
13277 }
13278 b1 = PyBytes_AS_STRING(r1);
13279 for (i = 0; i < numnondigits; ++i)
13280 *b1++ = *buf++;
13281 for (i = 0; i < prec - numdigits; i++)
13282 *b1++ = '0';
13283 for (i = 0; i < numdigits; i++)
13284 *b1++ = *buf++;
13285 *b1 = '\0';
13286 Py_DECREF(result);
13287 result = r1;
13288 buf = PyBytes_AS_STRING(result);
13289 len = numnondigits + prec;
13290 }
13291
13292 /* Fix up case for hex conversions. */
13293 if (type == 'X') {
13294 /* Need to convert all lower case letters to upper case.
13295 and need to convert 0x to 0X (and -0x to -0X). */
13296 for (i = 0; i < len; i++)
13297 if (buf[i] >= 'a' && buf[i] <= 'x')
13298 buf[i] -= 'a'-'A';
13299 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013300 if (!PyUnicode_Check(result)
13301 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013302 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013303 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013304 Py_DECREF(result);
13305 result = unicode;
13306 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013307 else if (len != PyUnicode_GET_LENGTH(result)) {
13308 if (PyUnicode_Resize(&result, len) < 0)
13309 Py_CLEAR(result);
13310 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013311 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013312}
13313
Victor Stinner621ef3d2012-10-02 00:33:47 +020013314/* Format an integer.
13315 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013316 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013317 * -1 and raise an exception on error */
13318static int
Victor Stinnera47082312012-10-04 02:19:54 +020013319mainformatlong(PyObject *v,
13320 struct unicode_format_arg_t *arg,
13321 PyObject **p_output,
13322 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013323{
13324 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013325 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013326
13327 if (!PyNumber_Check(v))
13328 goto wrongtype;
13329
13330 if (!PyLong_Check(v)) {
13331 iobj = PyNumber_Long(v);
13332 if (iobj == NULL) {
13333 if (PyErr_ExceptionMatches(PyExc_TypeError))
13334 goto wrongtype;
13335 return -1;
13336 }
13337 assert(PyLong_Check(iobj));
13338 }
13339 else {
13340 iobj = v;
13341 Py_INCREF(iobj);
13342 }
13343
13344 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013345 && arg->width == -1 && arg->prec == -1
13346 && !(arg->flags & (F_SIGN | F_BLANK))
13347 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013348 {
13349 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013350 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013351 int base;
13352
Victor Stinnera47082312012-10-04 02:19:54 +020013353 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013354 {
13355 default:
13356 assert(0 && "'type' not in [diuoxX]");
13357 case 'd':
13358 case 'i':
13359 case 'u':
13360 base = 10;
13361 break;
13362 case 'o':
13363 base = 8;
13364 break;
13365 case 'x':
13366 case 'X':
13367 base = 16;
13368 break;
13369 }
13370
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013371 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13372 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013373 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013374 }
13375 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013376 return 1;
13377 }
13378
Victor Stinnera47082312012-10-04 02:19:54 +020013379 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013380 Py_DECREF(iobj);
13381 if (res == NULL)
13382 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013383 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013384 return 0;
13385
13386wrongtype:
13387 PyErr_Format(PyExc_TypeError,
13388 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013389 "not %.200s",
13390 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013391 return -1;
13392}
13393
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013394static Py_UCS4
13395formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013396{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013397 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013398 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013399 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013400 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013401 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013402 goto onError;
13403 }
13404 else {
13405 /* Integer input truncated to a character */
13406 long x;
13407 x = PyLong_AsLong(v);
13408 if (x == -1 && PyErr_Occurred())
13409 goto onError;
13410
Victor Stinner8faf8212011-12-08 22:14:11 +010013411 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013412 PyErr_SetString(PyExc_OverflowError,
13413 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013414 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013415 }
13416
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013417 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013418 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013419
Benjamin Peterson29060642009-01-31 22:14:21 +000013420 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013421 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013423 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013424}
13425
Victor Stinnera47082312012-10-04 02:19:54 +020013426/* Parse options of an argument: flags, width, precision.
13427 Handle also "%(name)" syntax.
13428
13429 Return 0 if the argument has been formatted into arg->str.
13430 Return 1 if the argument has been written into ctx->writer,
13431 Raise an exception and return -1 on error. */
13432static int
13433unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13434 struct unicode_format_arg_t *arg)
13435{
13436#define FORMAT_READ(ctx) \
13437 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13438
13439 PyObject *v;
13440
13441 arg->ch = FORMAT_READ(ctx);
13442 if (arg->ch == '(') {
13443 /* Get argument value from a dictionary. Example: "%(name)s". */
13444 Py_ssize_t keystart;
13445 Py_ssize_t keylen;
13446 PyObject *key;
13447 int pcount = 1;
13448
13449 if (ctx->dict == NULL) {
13450 PyErr_SetString(PyExc_TypeError,
13451 "format requires a mapping");
13452 return -1;
13453 }
13454 ++ctx->fmtpos;
13455 --ctx->fmtcnt;
13456 keystart = ctx->fmtpos;
13457 /* Skip over balanced parentheses */
13458 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13459 arg->ch = FORMAT_READ(ctx);
13460 if (arg->ch == ')')
13461 --pcount;
13462 else if (arg->ch == '(')
13463 ++pcount;
13464 ctx->fmtpos++;
13465 }
13466 keylen = ctx->fmtpos - keystart - 1;
13467 if (ctx->fmtcnt < 0 || pcount > 0) {
13468 PyErr_SetString(PyExc_ValueError,
13469 "incomplete format key");
13470 return -1;
13471 }
13472 key = PyUnicode_Substring(ctx->fmtstr,
13473 keystart, keystart + keylen);
13474 if (key == NULL)
13475 return -1;
13476 if (ctx->args_owned) {
13477 Py_DECREF(ctx->args);
13478 ctx->args_owned = 0;
13479 }
13480 ctx->args = PyObject_GetItem(ctx->dict, key);
13481 Py_DECREF(key);
13482 if (ctx->args == NULL)
13483 return -1;
13484 ctx->args_owned = 1;
13485 ctx->arglen = -1;
13486 ctx->argidx = -2;
13487 }
13488
13489 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
13490 arg->flags = 0;
13491 while (--ctx->fmtcnt >= 0) {
13492 arg->ch = FORMAT_READ(ctx);
13493 ctx->fmtpos++;
13494 switch (arg->ch) {
13495 case '-': arg->flags |= F_LJUST; continue;
13496 case '+': arg->flags |= F_SIGN; continue;
13497 case ' ': arg->flags |= F_BLANK; continue;
13498 case '#': arg->flags |= F_ALT; continue;
13499 case '0': arg->flags |= F_ZERO; continue;
13500 }
13501 break;
13502 }
13503
13504 /* Parse width. Example: "%10s" => width=10 */
13505 arg->width = -1;
13506 if (arg->ch == '*') {
13507 v = unicode_format_getnextarg(ctx);
13508 if (v == NULL)
13509 return -1;
13510 if (!PyLong_Check(v)) {
13511 PyErr_SetString(PyExc_TypeError,
13512 "* wants int");
13513 return -1;
13514 }
13515 arg->width = PyLong_AsLong(v);
13516 if (arg->width == -1 && PyErr_Occurred())
13517 return -1;
13518 if (arg->width < 0) {
13519 arg->flags |= F_LJUST;
13520 arg->width = -arg->width;
13521 }
13522 if (--ctx->fmtcnt >= 0) {
13523 arg->ch = FORMAT_READ(ctx);
13524 ctx->fmtpos++;
13525 }
13526 }
13527 else if (arg->ch >= '0' && arg->ch <= '9') {
13528 arg->width = arg->ch - '0';
13529 while (--ctx->fmtcnt >= 0) {
13530 arg->ch = FORMAT_READ(ctx);
13531 ctx->fmtpos++;
13532 if (arg->ch < '0' || arg->ch > '9')
13533 break;
13534 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13535 mixing signed and unsigned comparison. Since arg->ch is between
13536 '0' and '9', casting to int is safe. */
13537 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13538 PyErr_SetString(PyExc_ValueError,
13539 "width too big");
13540 return -1;
13541 }
13542 arg->width = arg->width*10 + (arg->ch - '0');
13543 }
13544 }
13545
13546 /* Parse precision. Example: "%.3f" => prec=3 */
13547 arg->prec = -1;
13548 if (arg->ch == '.') {
13549 arg->prec = 0;
13550 if (--ctx->fmtcnt >= 0) {
13551 arg->ch = FORMAT_READ(ctx);
13552 ctx->fmtpos++;
13553 }
13554 if (arg->ch == '*') {
13555 v = unicode_format_getnextarg(ctx);
13556 if (v == NULL)
13557 return -1;
13558 if (!PyLong_Check(v)) {
13559 PyErr_SetString(PyExc_TypeError,
13560 "* wants int");
13561 return -1;
13562 }
13563 arg->prec = PyLong_AsLong(v);
13564 if (arg->prec == -1 && PyErr_Occurred())
13565 return -1;
13566 if (arg->prec < 0)
13567 arg->prec = 0;
13568 if (--ctx->fmtcnt >= 0) {
13569 arg->ch = FORMAT_READ(ctx);
13570 ctx->fmtpos++;
13571 }
13572 }
13573 else if (arg->ch >= '0' && arg->ch <= '9') {
13574 arg->prec = arg->ch - '0';
13575 while (--ctx->fmtcnt >= 0) {
13576 arg->ch = FORMAT_READ(ctx);
13577 ctx->fmtpos++;
13578 if (arg->ch < '0' || arg->ch > '9')
13579 break;
13580 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13581 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013582 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013583 return -1;
13584 }
13585 arg->prec = arg->prec*10 + (arg->ch - '0');
13586 }
13587 }
13588 }
13589
13590 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13591 if (ctx->fmtcnt >= 0) {
13592 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13593 if (--ctx->fmtcnt >= 0) {
13594 arg->ch = FORMAT_READ(ctx);
13595 ctx->fmtpos++;
13596 }
13597 }
13598 }
13599 if (ctx->fmtcnt < 0) {
13600 PyErr_SetString(PyExc_ValueError,
13601 "incomplete format");
13602 return -1;
13603 }
13604 return 0;
13605
13606#undef FORMAT_READ
13607}
13608
13609/* Format one argument. Supported conversion specifiers:
13610
13611 - "s", "r", "a": any type
13612 - "i", "d", "u", "o", "x", "X": int
13613 - "e", "E", "f", "F", "g", "G": float
13614 - "c": int or str (1 character)
13615
13616 Return 0 if the argument has been formatted into *p_str,
13617 1 if the argument has been written into ctx->writer,
13618 -1 on error. */
13619static int
13620unicode_format_arg_format(struct unicode_formatter_t *ctx,
13621 struct unicode_format_arg_t *arg,
13622 PyObject **p_str)
13623{
13624 PyObject *v;
13625 _PyUnicodeWriter *writer = &ctx->writer;
13626
13627 if (ctx->fmtcnt == 0)
13628 ctx->writer.overallocate = 0;
13629
13630 if (arg->ch == '%') {
13631 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13632 return -1;
13633 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13634 writer->pos += 1;
13635 return 1;
13636 }
13637
13638 v = unicode_format_getnextarg(ctx);
13639 if (v == NULL)
13640 return -1;
13641
13642 arg->sign = 0;
13643
13644 switch (arg->ch) {
13645
13646 case 's':
13647 case 'r':
13648 case 'a':
13649 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13650 /* Fast path */
13651 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13652 return -1;
13653 return 1;
13654 }
13655
13656 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13657 *p_str = v;
13658 Py_INCREF(*p_str);
13659 }
13660 else {
13661 if (arg->ch == 's')
13662 *p_str = PyObject_Str(v);
13663 else if (arg->ch == 'r')
13664 *p_str = PyObject_Repr(v);
13665 else
13666 *p_str = PyObject_ASCII(v);
13667 }
13668 break;
13669
13670 case 'i':
13671 case 'd':
13672 case 'u':
13673 case 'o':
13674 case 'x':
13675 case 'X':
13676 {
13677 int ret = mainformatlong(v, arg, p_str, writer);
13678 if (ret != 0)
13679 return ret;
13680 arg->sign = 1;
13681 break;
13682 }
13683
13684 case 'e':
13685 case 'E':
13686 case 'f':
13687 case 'F':
13688 case 'g':
13689 case 'G':
13690 if (arg->width == -1 && arg->prec == -1
13691 && !(arg->flags & (F_SIGN | F_BLANK)))
13692 {
13693 /* Fast path */
13694 if (formatfloat(v, arg, NULL, writer) == -1)
13695 return -1;
13696 return 1;
13697 }
13698
13699 arg->sign = 1;
13700 if (formatfloat(v, arg, p_str, NULL) == -1)
13701 return -1;
13702 break;
13703
13704 case 'c':
13705 {
13706 Py_UCS4 ch = formatchar(v);
13707 if (ch == (Py_UCS4) -1)
13708 return -1;
13709 if (arg->width == -1 && arg->prec == -1) {
13710 /* Fast path */
13711 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13712 return -1;
13713 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13714 writer->pos += 1;
13715 return 1;
13716 }
13717 *p_str = PyUnicode_FromOrdinal(ch);
13718 break;
13719 }
13720
13721 default:
13722 PyErr_Format(PyExc_ValueError,
13723 "unsupported format character '%c' (0x%x) "
13724 "at index %zd",
13725 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13726 (int)arg->ch,
13727 ctx->fmtpos - 1);
13728 return -1;
13729 }
13730 if (*p_str == NULL)
13731 return -1;
13732 assert (PyUnicode_Check(*p_str));
13733 return 0;
13734}
13735
13736static int
13737unicode_format_arg_output(struct unicode_formatter_t *ctx,
13738 struct unicode_format_arg_t *arg,
13739 PyObject *str)
13740{
13741 Py_ssize_t len;
13742 enum PyUnicode_Kind kind;
13743 void *pbuf;
13744 Py_ssize_t pindex;
13745 Py_UCS4 signchar;
13746 Py_ssize_t buflen;
13747 Py_UCS4 maxchar, bufmaxchar;
13748 Py_ssize_t sublen;
13749 _PyUnicodeWriter *writer = &ctx->writer;
13750 Py_UCS4 fill;
13751
13752 fill = ' ';
13753 if (arg->sign && arg->flags & F_ZERO)
13754 fill = '0';
13755
13756 if (PyUnicode_READY(str) == -1)
13757 return -1;
13758
13759 len = PyUnicode_GET_LENGTH(str);
13760 if ((arg->width == -1 || arg->width <= len)
13761 && (arg->prec == -1 || arg->prec >= len)
13762 && !(arg->flags & (F_SIGN | F_BLANK)))
13763 {
13764 /* Fast path */
13765 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13766 return -1;
13767 return 0;
13768 }
13769
13770 /* Truncate the string for "s", "r" and "a" formats
13771 if the precision is set */
13772 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13773 if (arg->prec >= 0 && len > arg->prec)
13774 len = arg->prec;
13775 }
13776
13777 /* Adjust sign and width */
13778 kind = PyUnicode_KIND(str);
13779 pbuf = PyUnicode_DATA(str);
13780 pindex = 0;
13781 signchar = '\0';
13782 if (arg->sign) {
13783 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13784 if (ch == '-' || ch == '+') {
13785 signchar = ch;
13786 len--;
13787 pindex++;
13788 }
13789 else if (arg->flags & F_SIGN)
13790 signchar = '+';
13791 else if (arg->flags & F_BLANK)
13792 signchar = ' ';
13793 else
13794 arg->sign = 0;
13795 }
13796 if (arg->width < len)
13797 arg->width = len;
13798
13799 /* Prepare the writer */
13800 bufmaxchar = 127;
13801 if (!(arg->flags & F_LJUST)) {
13802 if (arg->sign) {
13803 if ((arg->width-1) > len)
13804 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13805 }
13806 else {
13807 if (arg->width > len)
13808 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13809 }
13810 }
13811 maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13812 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13813 buflen = arg->width;
13814 if (arg->sign && len == arg->width)
13815 buflen++;
13816 if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1)
13817 return -1;
13818
13819 /* Write the sign if needed */
13820 if (arg->sign) {
13821 if (fill != ' ') {
13822 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13823 writer->pos += 1;
13824 }
13825 if (arg->width > len)
13826 arg->width--;
13827 }
13828
13829 /* Write the numeric prefix for "x", "X" and "o" formats
13830 if the alternate form is used.
13831 For example, write "0x" for the "%#x" format. */
13832 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13833 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13834 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
13835 if (fill != ' ') {
13836 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13837 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13838 writer->pos += 2;
13839 pindex += 2;
13840 }
13841 arg->width -= 2;
13842 if (arg->width < 0)
13843 arg->width = 0;
13844 len -= 2;
13845 }
13846
13847 /* Pad left with the fill character if needed */
13848 if (arg->width > len && !(arg->flags & F_LJUST)) {
13849 sublen = arg->width - len;
13850 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
13851 writer->pos += sublen;
13852 arg->width = len;
13853 }
13854
13855 /* If padding with spaces: write sign if needed and/or numeric prefix if
13856 the alternate form is used */
13857 if (fill == ' ') {
13858 if (arg->sign) {
13859 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13860 writer->pos += 1;
13861 }
13862 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13863 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13864 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
13865 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13866 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13867 writer->pos += 2;
13868 pindex += 2;
13869 }
13870 }
13871
13872 /* Write characters */
13873 if (len) {
13874 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13875 str, pindex, len);
13876 writer->pos += len;
13877 }
13878
13879 /* Pad right with the fill character if needed */
13880 if (arg->width > len) {
13881 sublen = arg->width - len;
13882 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
13883 writer->pos += sublen;
13884 }
13885 return 0;
13886}
13887
13888/* Helper of PyUnicode_Format(): format one arg.
13889 Return 0 on success, raise an exception and return -1 on error. */
13890static int
13891unicode_format_arg(struct unicode_formatter_t *ctx)
13892{
13893 struct unicode_format_arg_t arg;
13894 PyObject *str;
13895 int ret;
13896
13897 ret = unicode_format_arg_parse(ctx, &arg);
13898 if (ret == -1)
13899 return -1;
13900
13901 ret = unicode_format_arg_format(ctx, &arg, &str);
13902 if (ret == -1)
13903 return -1;
13904
13905 if (ret != 1) {
13906 ret = unicode_format_arg_output(ctx, &arg, str);
13907 Py_DECREF(str);
13908 if (ret == -1)
13909 return -1;
13910 }
13911
13912 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
13913 PyErr_SetString(PyExc_TypeError,
13914 "not all arguments converted during string formatting");
13915 return -1;
13916 }
13917 return 0;
13918}
13919
Alexander Belopolsky40018472011-02-26 01:02:56 +000013920PyObject *
13921PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013922{
Victor Stinnera47082312012-10-04 02:19:54 +020013923 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000013924
Guido van Rossumd57fd912000-03-10 22:53:23 +000013925 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013926 PyErr_BadInternalCall();
13927 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013928 }
Victor Stinnera47082312012-10-04 02:19:54 +020013929
13930 ctx.fmtstr = PyUnicode_FromObject(format);
13931 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020013933 if (PyUnicode_READY(ctx.fmtstr) == -1) {
13934 Py_DECREF(ctx.fmtstr);
13935 return NULL;
13936 }
13937 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
13938 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
13939 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
13940 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013941
Victor Stinnera47082312012-10-04 02:19:54 +020013942 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013943
Guido van Rossumd57fd912000-03-10 22:53:23 +000013944 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020013945 ctx.arglen = PyTuple_Size(args);
13946 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013947 }
13948 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013949 ctx.arglen = -1;
13950 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013951 }
Victor Stinnera47082312012-10-04 02:19:54 +020013952 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013953 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020013954 ctx.dict = args;
13955 else
13956 ctx.dict = NULL;
13957 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013958
Victor Stinnera47082312012-10-04 02:19:54 +020013959 while (--ctx.fmtcnt >= 0) {
13960 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13961 Py_ssize_t nonfmtpos, sublen;
13962 Py_UCS4 maxchar;
13963
13964 nonfmtpos = ctx.fmtpos++;
13965 while (ctx.fmtcnt >= 0 &&
13966 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13967 ctx.fmtpos++;
13968 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013969 }
Victor Stinnera47082312012-10-04 02:19:54 +020013970 if (ctx.fmtcnt < 0) {
13971 ctx.fmtpos--;
13972 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020013973 }
Victor Stinnera47082312012-10-04 02:19:54 +020013974 sublen = ctx.fmtpos - nonfmtpos;
13975 maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr,
Victor Stinneree4544c2012-05-09 22:24:08 +020013976 nonfmtpos, nonfmtpos + sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020013977 if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013978 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013979
Victor Stinnera47082312012-10-04 02:19:54 +020013980 _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos,
13981 ctx.fmtstr, nonfmtpos, sublen);
13982 ctx.writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013983 }
13984 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013985 ctx.fmtpos++;
13986 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013987 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020013988 }
13989 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013990
Victor Stinnera47082312012-10-04 02:19:54 +020013991 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013992 PyErr_SetString(PyExc_TypeError,
13993 "not all arguments converted during string formatting");
13994 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013995 }
13996
Victor Stinnera47082312012-10-04 02:19:54 +020013997 if (ctx.args_owned) {
13998 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013999 }
Victor Stinnera47082312012-10-04 02:19:54 +020014000 Py_DECREF(ctx.fmtstr);
14001 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014002
Benjamin Peterson29060642009-01-31 22:14:21 +000014003 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014004 Py_DECREF(ctx.fmtstr);
14005 _PyUnicodeWriter_Dealloc(&ctx.writer);
14006 if (ctx.args_owned) {
14007 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014008 }
14009 return NULL;
14010}
14011
Jeremy Hylton938ace62002-07-17 16:30:39 +000014012static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014013unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14014
Tim Peters6d6c1a32001-08-02 04:15:00 +000014015static PyObject *
14016unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14017{
Benjamin Peterson29060642009-01-31 22:14:21 +000014018 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014019 static char *kwlist[] = {"object", "encoding", "errors", 0};
14020 char *encoding = NULL;
14021 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014022
Benjamin Peterson14339b62009-01-31 16:36:08 +000014023 if (type != &PyUnicode_Type)
14024 return unicode_subtype_new(type, args, kwds);
14025 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014026 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014027 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014028 if (x == NULL) {
14029 Py_INCREF(unicode_empty);
14030 return unicode_empty;
14031 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014032 if (encoding == NULL && errors == NULL)
14033 return PyObject_Str(x);
14034 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014035 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014036}
14037
Guido van Rossume023fe02001-08-30 03:12:59 +000014038static PyObject *
14039unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14040{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014041 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014042 Py_ssize_t length, char_size;
14043 int share_wstr, share_utf8;
14044 unsigned int kind;
14045 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014046
Benjamin Peterson14339b62009-01-31 16:36:08 +000014047 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014048
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014049 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014050 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014051 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014052 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014053 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014054 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014055 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014056 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014057
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014058 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014059 if (self == NULL) {
14060 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014061 return NULL;
14062 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014063 kind = PyUnicode_KIND(unicode);
14064 length = PyUnicode_GET_LENGTH(unicode);
14065
14066 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014067#ifdef Py_DEBUG
14068 _PyUnicode_HASH(self) = -1;
14069#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014070 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014071#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014072 _PyUnicode_STATE(self).interned = 0;
14073 _PyUnicode_STATE(self).kind = kind;
14074 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014075 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014076 _PyUnicode_STATE(self).ready = 1;
14077 _PyUnicode_WSTR(self) = NULL;
14078 _PyUnicode_UTF8_LENGTH(self) = 0;
14079 _PyUnicode_UTF8(self) = NULL;
14080 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014081 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014082
14083 share_utf8 = 0;
14084 share_wstr = 0;
14085 if (kind == PyUnicode_1BYTE_KIND) {
14086 char_size = 1;
14087 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14088 share_utf8 = 1;
14089 }
14090 else if (kind == PyUnicode_2BYTE_KIND) {
14091 char_size = 2;
14092 if (sizeof(wchar_t) == 2)
14093 share_wstr = 1;
14094 }
14095 else {
14096 assert(kind == PyUnicode_4BYTE_KIND);
14097 char_size = 4;
14098 if (sizeof(wchar_t) == 4)
14099 share_wstr = 1;
14100 }
14101
14102 /* Ensure we won't overflow the length. */
14103 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14104 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014105 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014106 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014107 data = PyObject_MALLOC((length + 1) * char_size);
14108 if (data == NULL) {
14109 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014110 goto onError;
14111 }
14112
Victor Stinnerc3c74152011-10-02 20:39:55 +020014113 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014114 if (share_utf8) {
14115 _PyUnicode_UTF8_LENGTH(self) = length;
14116 _PyUnicode_UTF8(self) = data;
14117 }
14118 if (share_wstr) {
14119 _PyUnicode_WSTR_LENGTH(self) = length;
14120 _PyUnicode_WSTR(self) = (wchar_t *)data;
14121 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014122
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014123 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014124 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014125 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014126#ifdef Py_DEBUG
14127 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14128#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014129 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014130 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014131
14132onError:
14133 Py_DECREF(unicode);
14134 Py_DECREF(self);
14135 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014136}
14137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014138PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014139"str(object='') -> str\n\
14140str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014141\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014142Create a new string object from the given object. If encoding or\n\
14143errors is specified, then the object must expose a data buffer\n\
14144that will be decoded using the given encoding and error handler.\n\
14145Otherwise, returns the result of object.__str__() (if defined)\n\
14146or repr(object).\n\
14147encoding defaults to sys.getdefaultencoding().\n\
14148errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014149
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014150static PyObject *unicode_iter(PyObject *seq);
14151
Guido van Rossumd57fd912000-03-10 22:53:23 +000014152PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014153 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014154 "str", /* tp_name */
14155 sizeof(PyUnicodeObject), /* tp_size */
14156 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014157 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014158 (destructor)unicode_dealloc, /* tp_dealloc */
14159 0, /* tp_print */
14160 0, /* tp_getattr */
14161 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014162 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014163 unicode_repr, /* tp_repr */
14164 &unicode_as_number, /* tp_as_number */
14165 &unicode_as_sequence, /* tp_as_sequence */
14166 &unicode_as_mapping, /* tp_as_mapping */
14167 (hashfunc) unicode_hash, /* tp_hash*/
14168 0, /* tp_call*/
14169 (reprfunc) unicode_str, /* tp_str */
14170 PyObject_GenericGetAttr, /* tp_getattro */
14171 0, /* tp_setattro */
14172 0, /* tp_as_buffer */
14173 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014174 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014175 unicode_doc, /* tp_doc */
14176 0, /* tp_traverse */
14177 0, /* tp_clear */
14178 PyUnicode_RichCompare, /* tp_richcompare */
14179 0, /* tp_weaklistoffset */
14180 unicode_iter, /* tp_iter */
14181 0, /* tp_iternext */
14182 unicode_methods, /* tp_methods */
14183 0, /* tp_members */
14184 0, /* tp_getset */
14185 &PyBaseObject_Type, /* tp_base */
14186 0, /* tp_dict */
14187 0, /* tp_descr_get */
14188 0, /* tp_descr_set */
14189 0, /* tp_dictoffset */
14190 0, /* tp_init */
14191 0, /* tp_alloc */
14192 unicode_new, /* tp_new */
14193 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014194};
14195
14196/* Initialize the Unicode implementation */
14197
Victor Stinner3a50e702011-10-18 21:21:00 +020014198int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014199{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014200 int i;
14201
Thomas Wouters477c8d52006-05-27 19:21:47 +000014202 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014203 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014204 0x000A, /* LINE FEED */
14205 0x000D, /* CARRIAGE RETURN */
14206 0x001C, /* FILE SEPARATOR */
14207 0x001D, /* GROUP SEPARATOR */
14208 0x001E, /* RECORD SEPARATOR */
14209 0x0085, /* NEXT LINE */
14210 0x2028, /* LINE SEPARATOR */
14211 0x2029, /* PARAGRAPH SEPARATOR */
14212 };
14213
Fred Drakee4315f52000-05-09 19:53:39 +000014214 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014215 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014216 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014217 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014218 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014219
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014220 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014221 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014222 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014223 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014224
14225 /* initialize the linebreak bloom filter */
14226 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014227 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014228 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014229
14230 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014231
Benjamin Petersonc4311282012-10-30 23:21:10 -040014232 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14233 Py_FatalError("Can't initialize field name iterator type");
14234
14235 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14236 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014237
Victor Stinner3a50e702011-10-18 21:21:00 +020014238#ifdef HAVE_MBCS
14239 winver.dwOSVersionInfoSize = sizeof(winver);
14240 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14241 PyErr_SetFromWindowsErr(0);
14242 return -1;
14243 }
14244#endif
14245 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014246}
14247
14248/* Finalize the Unicode implementation */
14249
/* Clear the str free list.

   This implementation releases nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14255
Guido van Rossumd57fd912000-03-10 22:53:23 +000014256void
Thomas Wouters78890102000-07-22 19:25:51 +000014257_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014258{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014259 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014260
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014261 Py_XDECREF(unicode_empty);
14262 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014263
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014264 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014265 if (unicode_latin1[i]) {
14266 Py_DECREF(unicode_latin1[i]);
14267 unicode_latin1[i] = NULL;
14268 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014269 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014270 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014271 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014272}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014273
Walter Dörwald16807132007-05-25 13:52:07 +000014274void
14275PyUnicode_InternInPlace(PyObject **p)
14276{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014277 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014278 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014279#ifdef Py_DEBUG
14280 assert(s != NULL);
14281 assert(_PyUnicode_CHECK(s));
14282#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014283 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014284 return;
14285#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014286 /* If it's a subclass, we don't really know what putting
14287 it in the interned dict might do. */
14288 if (!PyUnicode_CheckExact(s))
14289 return;
14290 if (PyUnicode_CHECK_INTERNED(s))
14291 return;
14292 if (interned == NULL) {
14293 interned = PyDict_New();
14294 if (interned == NULL) {
14295 PyErr_Clear(); /* Don't leave an exception */
14296 return;
14297 }
14298 }
14299 /* It might be that the GetItem call fails even
14300 though the key is present in the dictionary,
14301 namely when this happens during a stack overflow. */
14302 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014303 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014304 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014305
Benjamin Peterson29060642009-01-31 22:14:21 +000014306 if (t) {
14307 Py_INCREF(t);
14308 Py_DECREF(*p);
14309 *p = t;
14310 return;
14311 }
Walter Dörwald16807132007-05-25 13:52:07 +000014312
Benjamin Peterson14339b62009-01-31 16:36:08 +000014313 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014314 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014315 PyErr_Clear();
14316 PyThreadState_GET()->recursion_critical = 0;
14317 return;
14318 }
14319 PyThreadState_GET()->recursion_critical = 0;
14320 /* The two references in interned are not counted by refcnt.
14321 The deallocator will take care of this */
14322 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014323 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014324}
14325
14326void
14327PyUnicode_InternImmortal(PyObject **p)
14328{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014329 PyUnicode_InternInPlace(p);
14330 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014331 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014332 Py_INCREF(*p);
14333 }
Walter Dörwald16807132007-05-25 13:52:07 +000014334}
14335
14336PyObject *
14337PyUnicode_InternFromString(const char *cp)
14338{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014339 PyObject *s = PyUnicode_FromString(cp);
14340 if (s == NULL)
14341 return NULL;
14342 PyUnicode_InternInPlace(&s);
14343 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014344}
14345
Alexander Belopolsky40018472011-02-26 01:02:56 +000014346void
14347_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014348{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014349 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014350 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014351 Py_ssize_t i, n;
14352 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014353
Benjamin Peterson14339b62009-01-31 16:36:08 +000014354 if (interned == NULL || !PyDict_Check(interned))
14355 return;
14356 keys = PyDict_Keys(interned);
14357 if (keys == NULL || !PyList_Check(keys)) {
14358 PyErr_Clear();
14359 return;
14360 }
Walter Dörwald16807132007-05-25 13:52:07 +000014361
Benjamin Peterson14339b62009-01-31 16:36:08 +000014362 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14363 detector, interned unicode strings are not forcibly deallocated;
14364 rather, we give them their stolen references back, and then clear
14365 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014366
Benjamin Peterson14339b62009-01-31 16:36:08 +000014367 n = PyList_GET_SIZE(keys);
14368 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014369 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014370 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014371 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014372 if (PyUnicode_READY(s) == -1) {
14373 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014374 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014375 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014376 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014377 case SSTATE_NOT_INTERNED:
14378 /* XXX Shouldn't happen */
14379 break;
14380 case SSTATE_INTERNED_IMMORTAL:
14381 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014382 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014383 break;
14384 case SSTATE_INTERNED_MORTAL:
14385 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014386 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014387 break;
14388 default:
14389 Py_FatalError("Inconsistent interned string state.");
14390 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014391 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014392 }
14393 fprintf(stderr, "total size of all interned strings: "
14394 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14395 "mortal/immortal\n", mortal_size, immortal_size);
14396 Py_DECREF(keys);
14397 PyDict_Clear(interned);
14398 Py_DECREF(interned);
14399 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014400}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014401
14402
14403/********************* Unicode Iterator **************************/
14404
14405typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014406 PyObject_HEAD
14407 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014408 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014409} unicodeiterobject;
14410
14411static void
14412unicodeiter_dealloc(unicodeiterobject *it)
14413{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014414 _PyObject_GC_UNTRACK(it);
14415 Py_XDECREF(it->it_seq);
14416 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014417}
14418
14419static int
14420unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14421{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014422 Py_VISIT(it->it_seq);
14423 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014424}
14425
14426static PyObject *
14427unicodeiter_next(unicodeiterobject *it)
14428{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014429 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014430
Benjamin Peterson14339b62009-01-31 16:36:08 +000014431 assert(it != NULL);
14432 seq = it->it_seq;
14433 if (seq == NULL)
14434 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014435 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014437 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14438 int kind = PyUnicode_KIND(seq);
14439 void *data = PyUnicode_DATA(seq);
14440 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14441 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014442 if (item != NULL)
14443 ++it->it_index;
14444 return item;
14445 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014446
Benjamin Peterson14339b62009-01-31 16:36:08 +000014447 Py_DECREF(seq);
14448 it->it_seq = NULL;
14449 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014450}
14451
14452static PyObject *
14453unicodeiter_len(unicodeiterobject *it)
14454{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014455 Py_ssize_t len = 0;
14456 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014457 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014458 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014459}
14460
14461PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14462
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014463static PyObject *
14464unicodeiter_reduce(unicodeiterobject *it)
14465{
14466 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014467 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014468 it->it_seq, it->it_index);
14469 } else {
14470 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14471 if (u == NULL)
14472 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014473 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014474 }
14475}
14476
14477PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14478
14479static PyObject *
14480unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14481{
14482 Py_ssize_t index = PyLong_AsSsize_t(state);
14483 if (index == -1 && PyErr_Occurred())
14484 return NULL;
14485 if (index < 0)
14486 index = 0;
14487 it->it_index = index;
14488 Py_RETURN_NONE;
14489}
14490
14491PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14492
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014493static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014494 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014495 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014496 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14497 reduce_doc},
14498 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14499 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014500 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014501};
14502
/* Type object for the iterator returned by unicode_iter() ("str_iterator").
   Instances are GC-tracked (Py_TPFLAGS_HAVE_GC) and are their own iterator
   (tp_iter is PyObject_SelfIter).  Slots after the last initializer are
   implicitly zero. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator", /* tp_name */
    sizeof(unicodeiterobject), /* tp_basicsize */
    0, /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc, /* tp_dealloc */
    0, /* tp_print */
    0, /* tp_getattr */
    0, /* tp_setattr */
    0, /* tp_reserved */
    0, /* tp_repr */
    0, /* tp_as_number */
    0, /* tp_as_sequence */
    0, /* tp_as_mapping */
    0, /* tp_hash */
    0, /* tp_call */
    0, /* tp_str */
    PyObject_GenericGetAttr, /* tp_getattro */
    0, /* tp_setattro */
    0, /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0, /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0, /* tp_clear */
    0, /* tp_richcompare */
    0, /* tp_weaklistoffset */
    PyObject_SelfIter, /* tp_iter */
    (iternextfunc)unicodeiter_next, /* tp_iternext */
    unicodeiter_methods, /* tp_methods */
    0, /* tp_members (the slot that follows tp_methods in PyTypeObject) */
};
14535
14536static PyObject *
14537unicode_iter(PyObject *seq)
14538{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014539 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014540
Benjamin Peterson14339b62009-01-31 16:36:08 +000014541 if (!PyUnicode_Check(seq)) {
14542 PyErr_BadInternalCall();
14543 return NULL;
14544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014545 if (PyUnicode_READY(seq) == -1)
14546 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014547 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14548 if (it == NULL)
14549 return NULL;
14550 it->it_index = 0;
14551 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014552 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014553 _PyObject_GC_TRACK(it);
14554 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014555}
14556
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014557
14558size_t
14559Py_UNICODE_strlen(const Py_UNICODE *u)
14560{
14561 int res = 0;
14562 while(*u++)
14563 res++;
14564 return res;
14565}
14566
14567Py_UNICODE*
14568Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14569{
14570 Py_UNICODE *u = s1;
14571 while ((*u++ = *s2++));
14572 return s1;
14573}
14574
14575Py_UNICODE*
14576Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14577{
14578 Py_UNICODE *u = s1;
14579 while ((*u++ = *s2++))
14580 if (n-- == 0)
14581 break;
14582 return s1;
14583}
14584
14585Py_UNICODE*
14586Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14587{
14588 Py_UNICODE *u1 = s1;
14589 u1 += Py_UNICODE_strlen(u1);
14590 Py_UNICODE_strcpy(u1, s2);
14591 return s1;
14592}
14593
14594int
14595Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14596{
14597 while (*s1 && *s2 && *s1 == *s2)
14598 s1++, s2++;
14599 if (*s1 && *s2)
14600 return (*s1 < *s2) ? -1 : +1;
14601 if (*s1)
14602 return 1;
14603 if (*s2)
14604 return -1;
14605 return 0;
14606}
14607
14608int
14609Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14610{
14611 register Py_UNICODE u1, u2;
14612 for (; n != 0; n--) {
14613 u1 = *s1;
14614 u2 = *s2;
14615 if (u1 != u2)
14616 return (u1 < u2) ? -1 : +1;
14617 if (u1 == '\0')
14618 return 0;
14619 s1++;
14620 s2++;
14621 }
14622 return 0;
14623}
14624
14625Py_UNICODE*
14626Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14627{
14628 const Py_UNICODE *p;
14629 for (p = s; *p; p++)
14630 if (*p == c)
14631 return (Py_UNICODE*)p;
14632 return NULL;
14633}
14634
14635Py_UNICODE*
14636Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14637{
14638 const Py_UNICODE *p;
14639 p = s + Py_UNICODE_strlen(s);
14640 while (p != s) {
14641 p--;
14642 if (*p == c)
14643 return (Py_UNICODE*)p;
14644 }
14645 return NULL;
14646}
Victor Stinner331ea922010-08-10 16:37:20 +000014647
Victor Stinner71133ff2010-09-01 23:43:53 +000014648Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014649PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014650{
Victor Stinner577db2c2011-10-11 22:12:48 +020014651 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014652 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014654 if (!PyUnicode_Check(unicode)) {
14655 PyErr_BadArgument();
14656 return NULL;
14657 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014658 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014659 if (u == NULL)
14660 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014661 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014662 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014663 PyErr_NoMemory();
14664 return NULL;
14665 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014666 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014667 size *= sizeof(Py_UNICODE);
14668 copy = PyMem_Malloc(size);
14669 if (copy == NULL) {
14670 PyErr_NoMemory();
14671 return NULL;
14672 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014673 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014674 return copy;
14675}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014676
Georg Brandl66c221e2010-10-14 07:04:07 +000014677/* A _string module, to export formatter_parser and formatter_field_name_split
14678 to the string.Formatter class implemented in Python. */
14679
/* Functions exported by the _string module.  Both entries are METH_O,
   i.e. each takes exactly one argument. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL} /* sentinel */
};
14687
/* Module definition passed to PyModule_Create() by PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string", /* m_name */
    PyDoc_STR("string helper module"), /* m_doc */
    0, /* m_size */
    _string_methods, /* m_methods */
    /* The remaining slots are left unset. */
    NULL,
    NULL,
    NULL,
    NULL
};
14699
14700PyMODINIT_FUNC
14701PyInit__string(void)
14702{
14703 return PyModule_Create(&_string_module);
14704}
14705
14706
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014707#ifdef __cplusplus
14708}
14709#endif