blob: 1f602b7ea6fa4cab69d79e2721999d5fe1c7532a [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
52 The globals are initialized by the _PyUnicode_Init() API and should
53 not be used before calling that API.
54
55*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000057
58#ifdef __cplusplus
59extern "C" {
60#endif
61
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros below: the full consistency
   checker (without content scan) in debug builds, a plain type check
   otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 field; also used as an assignment target. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 representation of a ready string: for a
   compact ASCII string this is the memory located directly after the
   PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked counterpart: a compact ASCII string reports its character
   length, any other string the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked field accessors (no assertions). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Field accessors that first assert _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked access to the data pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().
   The two values are OR-ed rather than compared, so the result can be
   larger than either argument.
   NOTE(review): presumably fine because PyUnicode_New() only needs an
   upper bound -- confirm before using it anywhere else. */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))

/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* true if the utf8 pointer of a non compact-ASCII string is the same
   pointer as its character data */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of the string is the same pointer as its
   character data (the wchar_t representation is shared, not a separate
   buffer).  Uses only its own parameter, so it works regardless of how the
   caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
125
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready yet or wstr differs from the character data */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every argument is evaluated exactly once.  "to" is parenthesized before
   the cast so that an expression argument (e.g. "buf + offset") is
   converted as a whole rather than only its first operand.  All locals
   carry a leading underscore so they cannot capture identifiers used in
   the arguments.  The main loop copies four characters per iteration; the
   remaining 0-3 characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* List of static strings. */
static _Py_Identifier *static_strings;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by code point (0..255); an entry
   is NULL until a string for that character has been stored. */
static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185
/* Fast detection of the most frequent whitespace characters.
   The table has 128 entries, one per ASCII code point laid out eight per
   row; an entry is 1 if the code point is one of the whitespace characters
   named in the comments below and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
216
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200217/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200218static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100220static int unicode_modifiable(PyObject *unicode);
221
Victor Stinnerfe226c02011-10-03 03:52:20 +0200222
Alexander Belopolsky40018472011-02-26 01:02:56 +0000223static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200224_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
225static PyObject *
226_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
227static PyObject *
228_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
229
230static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000231unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000232 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100233 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000234 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
235
Alexander Belopolsky40018472011-02-26 01:02:56 +0000236static void
237raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300238 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100239 PyObject *unicode,
240 Py_ssize_t startpos, Py_ssize_t endpos,
241 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000242
/* Fast detection of ASCII line break characters.
   The table has 128 entries, one per ASCII code point laid out eight per
   row; an entry is 1 if the code point is one of the line break characters
   named in the comments below and 0 otherwise.  The table is a read-only
   lookup table and is therefore declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
270
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300271/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
272 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000273Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000274PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000275{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000276#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000277 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000278#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000279 /* This is actually an illegal character, so it should
280 not be passed to unichr. */
281 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282#endif
283}
284
#ifdef Py_DEBUG
/* Debug-build invariant checker for a unicode object.

   Verifies, purely through assert(), that the state flags, kind, data
   pointer and the utf8/wstr caches of op are mutually consistent.  When
   check_content is non-zero and the string is not in the legacy wchar
   form, the characters are also scanned to verify that the narrowest
   possible kind is in use and that the data is null-terminated.

   Always returns 1, so the call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* compact ASCII: must be a ready 1-byte string */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* non-compact, ready: data lives in a separate block */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* an ASCII string shares its data with the utf8 cache */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is shared with the data exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an absent cache must report a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest code point actually stored */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be the null terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
400
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100401static PyObject*
402unicode_result_wchar(PyObject *unicode)
403{
404#ifndef Py_DEBUG
405 Py_ssize_t len;
406
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100407 len = _PyUnicode_WSTR_LENGTH(unicode);
408 if (len == 0) {
409 Py_INCREF(unicode_empty);
410 Py_DECREF(unicode);
411 return unicode_empty;
412 }
413
414 if (len == 1) {
415 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
416 if (ch < 256) {
417 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
418 Py_DECREF(unicode);
419 return latin1_char;
420 }
421 }
422
423 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200424 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425 return NULL;
426 }
427#else
Victor Stinneraa771272012-10-04 02:32:58 +0200428 assert(Py_REFCNT(unicode) == 1);
429
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100430 /* don't make the result ready in debug mode to ensure that the caller
431 makes the string ready before using it */
432 assert(_PyUnicode_CheckConsistency(unicode, 1));
433#endif
434 return unicode;
435}
436
437static PyObject*
438unicode_result_ready(PyObject *unicode)
439{
440 Py_ssize_t length;
441
442 length = PyUnicode_GET_LENGTH(unicode);
443 if (length == 0) {
444 if (unicode != unicode_empty) {
445 Py_INCREF(unicode_empty);
446 Py_DECREF(unicode);
447 }
448 return unicode_empty;
449 }
450
451 if (length == 1) {
452 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
453 if (ch < 256) {
454 PyObject *latin1_char = unicode_latin1[ch];
455 if (latin1_char != NULL) {
456 if (unicode != latin1_char) {
457 Py_INCREF(latin1_char);
458 Py_DECREF(unicode);
459 }
460 return latin1_char;
461 }
462 else {
463 assert(_PyUnicode_CheckConsistency(unicode, 1));
464 Py_INCREF(unicode);
465 unicode_latin1[ch] = unicode;
466 return unicode;
467 }
468 }
469 }
470
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 return unicode;
473}
474
475static PyObject*
476unicode_result(PyObject *unicode)
477{
478 assert(_PyUnicode_CHECK(unicode));
479 if (PyUnicode_IS_READY(unicode))
480 return unicode_result_ready(unicode);
481 else
482 return unicode_result_wchar(unicode);
483}
484
Victor Stinnerc4b49542011-12-11 22:44:26 +0100485static PyObject*
486unicode_result_unchanged(PyObject *unicode)
487{
488 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500489 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490 return NULL;
491 Py_INCREF(unicode);
492 return unicode;
493 }
494 else
495 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100496 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100497}
498
#ifdef HAVE_MBCS
/* Windows version information, only present when HAVE_MBCS is defined.
   NOTE(review): presumably filled in during module initialization and read
   by the MBCS codec code -- neither is visible here, confirm. */
static OSVERSIONINFOEX winver;
#endif
502
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of mask bits used: the largest of 128, 64 or
   32 that does not exceed LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* type of a bloom bitmask */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for ch in mask (mask must be an lvalue);
   BLOOM is non-zero if the bit for ch is set in mask.  A set bit only
   means that the character may be present: different characters can map
   to the same bit. */
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 are looked up directly in
   ascii_linebreak; for all others the bloom mask is used as a cheap
   pre-filter before calling Py_UNICODE_ISLINEBREAK().  Note that ch is
   evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000531
Alexander Belopolsky40018472011-02-26 01:02:56 +0000532Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000534{
535 /* calculate simple bloom-style bitmask for a given unicode string */
536
Antoine Pitrouf068f942010-01-13 14:19:12 +0000537 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000538 Py_ssize_t i;
539
540 mask = 0;
541 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543
544 return mask;
545}
546
/* true if chr occurs in the unicode object str: the bloom mask (expected
   to have been built from str) is checked first, and only on a possible
   hit is the string actually searched with PyUnicode_FindChar().  Note
   that chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000550
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200551/* Compilation of templated routines */
552
553#include "stringlib/asciilib.h"
554#include "stringlib/fastsearch.h"
555#include "stringlib/partition.h"
556#include "stringlib/split.h"
557#include "stringlib/count.h"
558#include "stringlib/find.h"
559#include "stringlib/find_max_char.h"
560#include "stringlib/localeutil.h"
561#include "stringlib/undef.h"
562
563#include "stringlib/ucs1lib.h"
564#include "stringlib/fastsearch.h"
565#include "stringlib/partition.h"
566#include "stringlib/split.h"
567#include "stringlib/count.h"
568#include "stringlib/find.h"
569#include "stringlib/find_max_char.h"
570#include "stringlib/localeutil.h"
571#include "stringlib/undef.h"
572
573#include "stringlib/ucs2lib.h"
574#include "stringlib/fastsearch.h"
575#include "stringlib/partition.h"
576#include "stringlib/split.h"
577#include "stringlib/count.h"
578#include "stringlib/find.h"
579#include "stringlib/find_max_char.h"
580#include "stringlib/localeutil.h"
581#include "stringlib/undef.h"
582
583#include "stringlib/ucs4lib.h"
584#include "stringlib/fastsearch.h"
585#include "stringlib/partition.h"
586#include "stringlib/split.h"
587#include "stringlib/count.h"
588#include "stringlib/find.h"
589#include "stringlib/find_max_char.h"
590#include "stringlib/localeutil.h"
591#include "stringlib/undef.h"
592
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200593#include "stringlib/unicodedefs.h"
594#include "stringlib/fastsearch.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100597#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599/* --- Unicode Object ----------------------------------------------------- */
600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200601static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200602fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200603
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200604Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
605 Py_ssize_t size, Py_UCS4 ch,
606 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200608 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
609
610 switch (kind) {
611 case PyUnicode_1BYTE_KIND:
612 {
613 Py_UCS1 ch1 = (Py_UCS1) ch;
614 if (ch1 == ch)
615 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
616 else
617 return -1;
618 }
619 case PyUnicode_2BYTE_KIND:
620 {
621 Py_UCS2 ch2 = (Py_UCS2) ch;
622 if (ch2 == ch)
623 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_4BYTE_KIND:
628 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
629 default:
630 assert(0);
631 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200633}
634
#ifdef Py_DEBUG
/* Debug helper: after a string has grown from old_length to its current
   length, overwrite the newly added characters with 0xff bytes so that
   code forgetting to initialize them is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Nothing is done if the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the size of one character in bytes */
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
653
/* Resize a compact, ready, modifiable unicode object to hold length
   characters by reallocating the object itself.

   Returns the resized object, whose address may differ from the argument,
   so the caller has to continue with the returned pointer.  On failure a
   MemoryError is set and NULL is returned; the original object is then
   still registered as a live object.

   NOTE(review): a separately allocated wstr buffer or utf8 cache is not
   touched here -- presumably callers guarantee that none exists or that it
   is invalidated elsewhere; confirm. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* the kind value is used as the size of one character in bytes */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* remember whether wstr aliases the data before the block may move */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* reject sizes for which header + (length + 1) characters would
       overflow Py_ssize_t */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    new_size = (struct_size + (length + 1) * char_size);

    /* unregister the object from the debug reference bookkeeping while it
       is being reallocated */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference(unicode);

    new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
    if (new_unicode == NULL) {
        /* the old block is still valid: register it again */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* re-point the shared wstr at the (possibly moved) data */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* write the null terminator after the last character */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
709
Alexander Belopolsky40018472011-02-26 01:02:56 +0000710static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200711resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712{
Victor Stinner95663112011-10-04 01:03:50 +0200713 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100714 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 if (PyUnicode_IS_READY(unicode)) {
719 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200720 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200722#ifdef Py_DEBUG
723 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
724#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725
726 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200727 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200728 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
729 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730
731 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
732 PyErr_NoMemory();
733 return -1;
734 }
735 new_size = (length + 1) * char_size;
736
Victor Stinner7a9105a2011-12-12 00:13:42 +0100737 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
738 {
739 PyObject_DEL(_PyUnicode_UTF8(unicode));
740 _PyUnicode_UTF8(unicode) = NULL;
741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
742 }
743
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744 data = (PyObject *)PyObject_REALLOC(data, new_size);
745 if (data == NULL) {
746 PyErr_NoMemory();
747 return -1;
748 }
749 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200750 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200751 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200752 _PyUnicode_WSTR_LENGTH(unicode) = length;
753 }
754 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200755 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200756 _PyUnicode_UTF8_LENGTH(unicode) = length;
757 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200758 _PyUnicode_LENGTH(unicode) = length;
759 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200760#ifdef Py_DEBUG
761 unicode_fill_invalid(unicode, old_length);
762#endif
Victor Stinner95663112011-10-04 01:03:50 +0200763 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200764 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 }
Victor Stinner95663112011-10-04 01:03:50 +0200768 assert(_PyUnicode_WSTR(unicode) != NULL);
769
770 /* check for integer overflow */
771 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
772 PyErr_NoMemory();
773 return -1;
774 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100775 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200776 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100777 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200778 if (!wstr) {
779 PyErr_NoMemory();
780 return -1;
781 }
782 _PyUnicode_WSTR(unicode) = wstr;
783 _PyUnicode_WSTR(unicode)[length] = 0;
784 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200785 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000786 return 0;
787}
788
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789static PyObject*
790resize_copy(PyObject *unicode, Py_ssize_t length)
791{
792 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100793 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100795
Benjamin Petersonbac79492012-01-14 13:34:47 -0500796 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100797 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200798
799 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
800 if (copy == NULL)
801 return NULL;
802
803 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200804 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200805 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200806 }
807 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200808 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100809
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200810 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 if (w == NULL)
812 return NULL;
813 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
814 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200815 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
816 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200817 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200818 }
819}
820
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000822 Ux0000 terminated; some code (e.g. new_identifier)
823 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824
825 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000827
828*/
829
Alexander Belopolsky40018472011-02-26 01:02:56 +0000830static PyUnicodeObject *
831_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832{
833 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200834 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835
Thomas Wouters477c8d52006-05-27 19:21:47 +0000836 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000837 if (length == 0 && unicode_empty != NULL) {
838 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200839 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840 }
841
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000842 /* Ensure we won't overflow the size. */
843 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
844 return (PyUnicodeObject *)PyErr_NoMemory();
845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 if (length < 0) {
847 PyErr_SetString(PyExc_SystemError,
848 "Negative size passed to _PyUnicode_New");
849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000850 }
851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
853 if (unicode == NULL)
854 return NULL;
855 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
856 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
857 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100858 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000859 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100860 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200862
Jeremy Hyltond8082792003-09-16 19:41:39 +0000863 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000864 * the caller fails before initializing str -- unicode_resize()
865 * reads str[0], and the Keep-Alive optimization can keep memory
866 * allocated for str alive across a call to unicode_dealloc(unicode).
867 * We don't want unicode_resize to read uninitialized memory in
868 * that case.
869 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870 _PyUnicode_WSTR(unicode)[0] = 0;
871 _PyUnicode_WSTR(unicode)[length] = 0;
872 _PyUnicode_WSTR_LENGTH(unicode) = length;
873 _PyUnicode_HASH(unicode) = -1;
874 _PyUnicode_STATE(unicode).interned = 0;
875 _PyUnicode_STATE(unicode).kind = 0;
876 _PyUnicode_STATE(unicode).compact = 0;
877 _PyUnicode_STATE(unicode).ready = 0;
878 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200879 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200881 _PyUnicode_UTF8(unicode) = NULL;
882 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100883 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 return unicode;
885}
886
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887static const char*
888unicode_kind_name(PyObject *unicode)
889{
Victor Stinner42dfd712011-10-03 14:41:45 +0200890 /* don't check consistency: unicode_kind_name() is called from
891 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 if (!PyUnicode_IS_COMPACT(unicode))
893 {
894 if (!PyUnicode_IS_READY(unicode))
895 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600896 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 {
898 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200899 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 return "legacy ascii";
901 else
902 return "legacy latin1";
903 case PyUnicode_2BYTE_KIND:
904 return "legacy UCS2";
905 case PyUnicode_4BYTE_KIND:
906 return "legacy UCS4";
907 default:
908 return "<legacy invalid kind>";
909 }
910 }
911 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600912 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200913 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200914 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200915 return "ascii";
916 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200917 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200918 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200919 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200920 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200921 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200922 default:
923 return "<invalid compact kind>";
924 }
925}
926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
932
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
936void *_PyUnicode_data(void *unicode){
937 printf("obj %p\n", unicode);
938 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
939 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
940 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
941 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
942 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
943 return PyUnicode_DATA(unicode);
944}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200945
946void
947_PyUnicode_Dump(PyObject *op)
948{
949 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200950 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
951 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
952 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200953
Victor Stinnera849a4b2011-10-03 12:12:11 +0200954 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200955 {
956 if (ascii->state.ascii)
957 data = (ascii + 1);
958 else
959 data = (compact + 1);
960 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200961 else
962 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200963 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
964
Victor Stinnera849a4b2011-10-03 12:12:11 +0200965 if (ascii->wstr == data)
966 printf("shared ");
967 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200968
Victor Stinnera3b334d2011-10-03 13:53:37 +0200969 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200970 printf(" (%zu), ", compact->wstr_length);
971 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
972 printf("shared ");
973 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200974 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200975 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200976}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977#endif
978
979PyObject *
980PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
981{
982 PyObject *obj;
983 PyCompactUnicodeObject *unicode;
984 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200985 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200986 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 Py_ssize_t char_size;
988 Py_ssize_t struct_size;
989
990 /* Optimization for empty strings */
991 if (size == 0 && unicode_empty != NULL) {
992 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200993 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200994 }
995
Victor Stinner9e9d6892011-10-04 01:02:02 +0200996 is_ascii = 0;
997 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200998 struct_size = sizeof(PyCompactUnicodeObject);
999 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001000 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001 char_size = 1;
1002 is_ascii = 1;
1003 struct_size = sizeof(PyASCIIObject);
1004 }
1005 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001006 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001007 char_size = 1;
1008 }
1009 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001010 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001011 char_size = 2;
1012 if (sizeof(wchar_t) == 2)
1013 is_sharing = 1;
1014 }
1015 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001016 if (maxchar > MAX_UNICODE) {
1017 PyErr_SetString(PyExc_SystemError,
1018 "invalid maximum character passed to PyUnicode_New");
1019 return NULL;
1020 }
Victor Stinner8f825062012-04-27 13:55:39 +02001021 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001022 char_size = 4;
1023 if (sizeof(wchar_t) == 4)
1024 is_sharing = 1;
1025 }
1026
1027 /* Ensure we won't overflow the size. */
1028 if (size < 0) {
1029 PyErr_SetString(PyExc_SystemError,
1030 "Negative size passed to PyUnicode_New");
1031 return NULL;
1032 }
1033 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1034 return PyErr_NoMemory();
1035
1036 /* Duplicated allocation code from _PyObject_New() instead of a call to
1037 * PyObject_New() so we are able to allocate space for the object and
1038 * it's data buffer.
1039 */
1040 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1041 if (obj == NULL)
1042 return PyErr_NoMemory();
1043 obj = PyObject_INIT(obj, &PyUnicode_Type);
1044 if (obj == NULL)
1045 return NULL;
1046
1047 unicode = (PyCompactUnicodeObject *)obj;
1048 if (is_ascii)
1049 data = ((PyASCIIObject*)obj) + 1;
1050 else
1051 data = unicode + 1;
1052 _PyUnicode_LENGTH(unicode) = size;
1053 _PyUnicode_HASH(unicode) = -1;
1054 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001055 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 _PyUnicode_STATE(unicode).compact = 1;
1057 _PyUnicode_STATE(unicode).ready = 1;
1058 _PyUnicode_STATE(unicode).ascii = is_ascii;
1059 if (is_ascii) {
1060 ((char*)data)[size] = 0;
1061 _PyUnicode_WSTR(unicode) = NULL;
1062 }
Victor Stinner8f825062012-04-27 13:55:39 +02001063 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 ((char*)data)[size] = 0;
1065 _PyUnicode_WSTR(unicode) = NULL;
1066 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001068 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 else {
1071 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001072 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001073 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001075 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001076 ((Py_UCS4*)data)[size] = 0;
1077 if (is_sharing) {
1078 _PyUnicode_WSTR_LENGTH(unicode) = size;
1079 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1080 }
1081 else {
1082 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1083 _PyUnicode_WSTR(unicode) = NULL;
1084 }
1085 }
Victor Stinner8f825062012-04-27 13:55:39 +02001086#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001087 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001088#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001089 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 return obj;
1091}
1092
1093#if SIZEOF_WCHAR_T == 2
1094/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1095 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001096 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097
1098 This function assumes that unicode can hold one more code point than wstr
1099 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001100static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103{
1104 const wchar_t *iter;
1105 Py_UCS4 *ucs4_out;
1106
Victor Stinner910337b2011-10-03 03:20:16 +02001107 assert(unicode != NULL);
1108 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1110 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1111
1112 for (iter = begin; iter < end; ) {
1113 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1114 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001115 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1116 && (iter+1) < end
1117 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 {
Victor Stinner551ac952011-11-29 22:58:13 +01001119 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 iter += 2;
1121 }
1122 else {
1123 *ucs4_out++ = *iter;
1124 iter++;
1125 }
1126 }
1127 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1128 _PyUnicode_GET_LENGTH(unicode)));
1129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130}
1131#endif
1132
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133static int
Victor Stinner488fa492011-12-12 00:01:39 +01001134unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001135{
Victor Stinner488fa492011-12-12 00:01:39 +01001136 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001137 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001138 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return -1;
1140 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141 return 0;
1142}
1143
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144static int
1145_copy_characters(PyObject *to, Py_ssize_t to_start,
1146 PyObject *from, Py_ssize_t from_start,
1147 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 unsigned int from_kind, to_kind;
1150 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151
Victor Stinneree4544c2012-05-09 22:24:08 +02001152 assert(0 <= how_many);
1153 assert(0 <= from_start);
1154 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001155 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001157 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158
Victor Stinnerd3f08822012-05-29 12:57:52 +02001159 assert(PyUnicode_Check(to));
1160 assert(PyUnicode_IS_READY(to));
1161 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1162
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001163 if (how_many == 0)
1164 return 0;
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001167 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170
Victor Stinnerf1852262012-06-16 16:38:26 +02001171#ifdef Py_DEBUG
1172 if (!check_maxchar
1173 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1174 {
1175 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1176 Py_UCS4 ch;
1177 Py_ssize_t i;
1178 for (i=0; i < how_many; i++) {
1179 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1180 assert(ch <= to_maxchar);
1181 }
1182 }
1183#endif
1184
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001185 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001186 if (check_maxchar
1187 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1188 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001189 /* Writing Latin-1 characters into an ASCII string requires to
1190 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001191 Py_UCS4 max_char;
1192 max_char = ucs1lib_find_max_char(from_data,
1193 (Py_UCS1*)from_data + how_many);
1194 if (max_char >= 128)
1195 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001196 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001197 Py_MEMCPY((char*)to_data + to_kind * to_start,
1198 (char*)from_data + from_kind * from_start,
1199 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001201 else if (from_kind == PyUnicode_1BYTE_KIND
1202 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 {
1204 _PyUnicode_CONVERT_BYTES(
1205 Py_UCS1, Py_UCS2,
1206 PyUnicode_1BYTE_DATA(from) + from_start,
1207 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1208 PyUnicode_2BYTE_DATA(to) + to_start
1209 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001210 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001211 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001212 && to_kind == PyUnicode_4BYTE_KIND)
1213 {
1214 _PyUnicode_CONVERT_BYTES(
1215 Py_UCS1, Py_UCS4,
1216 PyUnicode_1BYTE_DATA(from) + from_start,
1217 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1218 PyUnicode_4BYTE_DATA(to) + to_start
1219 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001220 }
1221 else if (from_kind == PyUnicode_2BYTE_KIND
1222 && to_kind == PyUnicode_4BYTE_KIND)
1223 {
1224 _PyUnicode_CONVERT_BYTES(
1225 Py_UCS2, Py_UCS4,
1226 PyUnicode_2BYTE_DATA(from) + from_start,
1227 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1228 PyUnicode_4BYTE_DATA(to) + to_start
1229 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001230 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001231 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001232 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1233
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001234 if (!check_maxchar) {
1235 if (from_kind == PyUnicode_2BYTE_KIND
1236 && to_kind == PyUnicode_1BYTE_KIND)
1237 {
1238 _PyUnicode_CONVERT_BYTES(
1239 Py_UCS2, Py_UCS1,
1240 PyUnicode_2BYTE_DATA(from) + from_start,
1241 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1242 PyUnicode_1BYTE_DATA(to) + to_start
1243 );
1244 }
1245 else if (from_kind == PyUnicode_4BYTE_KIND
1246 && to_kind == PyUnicode_1BYTE_KIND)
1247 {
1248 _PyUnicode_CONVERT_BYTES(
1249 Py_UCS4, Py_UCS1,
1250 PyUnicode_4BYTE_DATA(from) + from_start,
1251 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1252 PyUnicode_1BYTE_DATA(to) + to_start
1253 );
1254 }
1255 else if (from_kind == PyUnicode_4BYTE_KIND
1256 && to_kind == PyUnicode_2BYTE_KIND)
1257 {
1258 _PyUnicode_CONVERT_BYTES(
1259 Py_UCS4, Py_UCS2,
1260 PyUnicode_4BYTE_DATA(from) + from_start,
1261 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1262 PyUnicode_2BYTE_DATA(to) + to_start
1263 );
1264 }
1265 else {
1266 assert(0);
1267 return -1;
1268 }
1269 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001270 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001271 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001272 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001273 Py_ssize_t i;
1274
Victor Stinnera0702ab2011-09-29 14:14:38 +02001275 for (i=0; i < how_many; i++) {
1276 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001277 if (ch > to_maxchar)
1278 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001279 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1280 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001281 }
1282 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001283 return 0;
1284}
1285
Victor Stinnerd3f08822012-05-29 12:57:52 +02001286void
1287_PyUnicode_FastCopyCharacters(
1288 PyObject *to, Py_ssize_t to_start,
1289 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001290{
1291 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1292}
1293
1294Py_ssize_t
1295PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1296 PyObject *from, Py_ssize_t from_start,
1297 Py_ssize_t how_many)
1298{
1299 int err;
1300
1301 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1302 PyErr_BadInternalCall();
1303 return -1;
1304 }
1305
Benjamin Petersonbac79492012-01-14 13:34:47 -05001306 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001307 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001308 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001309 return -1;
1310
Victor Stinnerd3f08822012-05-29 12:57:52 +02001311 if (from_start < 0) {
1312 PyErr_SetString(PyExc_IndexError, "string index out of range");
1313 return -1;
1314 }
1315 if (to_start < 0) {
1316 PyErr_SetString(PyExc_IndexError, "string index out of range");
1317 return -1;
1318 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001319 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1320 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1321 PyErr_Format(PyExc_SystemError,
1322 "Cannot write %zi characters at %zi "
1323 "in a string of %zi characters",
1324 how_many, to_start, PyUnicode_GET_LENGTH(to));
1325 return -1;
1326 }
1327
1328 if (how_many == 0)
1329 return 0;
1330
Victor Stinner488fa492011-12-12 00:01:39 +01001331 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001332 return -1;
1333
1334 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1335 if (err) {
1336 PyErr_Format(PyExc_SystemError,
1337 "Cannot copy %s characters "
1338 "into a string of %s characters",
1339 unicode_kind_name(from),
1340 unicode_kind_name(to));
1341 return -1;
1342 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001343 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344}
1345
Victor Stinner17222162011-09-28 22:15:37 +02001346/* Find the maximum code point and count the number of surrogate pairs so a
1347 correct string length can be computed before converting a string to UCS4.
1348 This function counts single surrogates as a character and not as a pair.
1349
1350 Return 0 on success, or -1 on error. */
1351static int
1352find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1353 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001354{
1355 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001356 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357
Victor Stinnerc53be962011-10-02 21:33:54 +02001358 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001359 *num_surrogates = 0;
1360 *maxchar = 0;
1361
1362 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001364 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1365 && (iter+1) < end
1366 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001368 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 iter += 2;
1371 }
1372 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001374 {
1375 ch = *iter;
1376 iter++;
1377 }
1378 if (ch > *maxchar) {
1379 *maxchar = ch;
1380 if (*maxchar > MAX_UNICODE) {
1381 PyErr_Format(PyExc_ValueError,
1382 "character U+%x is not in range [U+0000; U+10ffff]",
1383 ch);
1384 return -1;
1385 }
1386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 }
1388 return 0;
1389}
1390
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001391int
1392_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393{
1394 wchar_t *end;
1395 Py_UCS4 maxchar = 0;
1396 Py_ssize_t num_surrogates;
1397#if SIZEOF_WCHAR_T == 2
1398 Py_ssize_t length_wo_surrogates;
1399#endif
1400
Georg Brandl7597add2011-10-05 16:36:47 +02001401 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001402 strings were created using _PyObject_New() and where no canonical
1403 representation (the str field) has been set yet aka strings
1404 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001405 assert(_PyUnicode_CHECK(unicode));
1406 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001408 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001409 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001410 /* Actually, it should neither be interned nor be anything else: */
1411 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001414 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001415 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417
1418 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001419 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1420 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 PyErr_NoMemory();
1422 return -1;
1423 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001424 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 _PyUnicode_WSTR(unicode), end,
1426 PyUnicode_1BYTE_DATA(unicode));
1427 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1428 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1429 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1430 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001431 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001432 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001433 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 }
1435 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001436 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001437 _PyUnicode_UTF8(unicode) = NULL;
1438 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 }
1440 PyObject_FREE(_PyUnicode_WSTR(unicode));
1441 _PyUnicode_WSTR(unicode) = NULL;
1442 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1443 }
1444 /* In this case we might have to convert down from 4-byte native
1445 wchar_t to 2-byte unicode. */
1446 else if (maxchar < 65536) {
1447 assert(num_surrogates == 0 &&
1448 "FindMaxCharAndNumSurrogatePairs() messed up");
1449
Victor Stinner506f5922011-09-28 22:34:18 +02001450#if SIZEOF_WCHAR_T == 2
1451 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001452 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001453 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1454 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1455 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001456 _PyUnicode_UTF8(unicode) = NULL;
1457 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001458#else
1459 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001460 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001461 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001462 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001463 PyErr_NoMemory();
1464 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 }
Victor Stinner506f5922011-09-28 22:34:18 +02001466 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1467 _PyUnicode_WSTR(unicode), end,
1468 PyUnicode_2BYTE_DATA(unicode));
1469 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1470 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1471 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001472 _PyUnicode_UTF8(unicode) = NULL;
1473 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001474 PyObject_FREE(_PyUnicode_WSTR(unicode));
1475 _PyUnicode_WSTR(unicode) = NULL;
1476 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1477#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 }
1479 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1480 else {
1481#if SIZEOF_WCHAR_T == 2
1482 /* in case the native representation is 2-bytes, we need to allocate a
1483 new normalized 4-byte version. */
1484 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001485 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1486 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 PyErr_NoMemory();
1488 return -1;
1489 }
1490 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1491 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001492 _PyUnicode_UTF8(unicode) = NULL;
1493 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001494 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1495 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001496 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 PyObject_FREE(_PyUnicode_WSTR(unicode));
1498 _PyUnicode_WSTR(unicode) = NULL;
1499 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1500#else
1501 assert(num_surrogates == 0);
1502
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001505 _PyUnicode_UTF8(unicode) = NULL;
1506 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1508#endif
1509 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1510 }
1511 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001512 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001513 return 0;
1514}
1515
Alexander Belopolsky40018472011-02-26 01:02:56 +00001516static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001517unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518{
Walter Dörwald16807132007-05-25 13:52:07 +00001519 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001520 case SSTATE_NOT_INTERNED:
1521 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001522
Benjamin Peterson29060642009-01-31 22:14:21 +00001523 case SSTATE_INTERNED_MORTAL:
1524 /* revive dead object temporarily for DelItem */
1525 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001526 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001527 Py_FatalError(
1528 "deletion of interned string failed");
1529 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 case SSTATE_INTERNED_IMMORTAL:
1532 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001533
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 default:
1535 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001536 }
1537
Victor Stinner03490912011-10-03 23:45:12 +02001538 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001539 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001540 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001541 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001542 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1543 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001545 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001546}
1547
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or one of
   the cached single-character latin1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only a ready string of exactly one character can be a cached
       latin1 singleton. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;
    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1564
Alexander Belopolsky40018472011-02-26 01:02:56 +00001565static int
Victor Stinner488fa492011-12-12 00:01:39 +01001566unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001567{
Victor Stinner488fa492011-12-12 00:01:39 +01001568 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001569 if (Py_REFCNT(unicode) != 1)
1570 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (_PyUnicode_HASH(unicode) != -1)
1572 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001573 if (PyUnicode_CHECK_INTERNED(unicode))
1574 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001575 if (!PyUnicode_CheckExact(unicode))
1576 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001577#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001578 /* singleton refcount is greater than 1 */
1579 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001580#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 return 1;
1582}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584static int
1585unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1586{
1587 PyObject *unicode;
1588 Py_ssize_t old_length;
1589
1590 assert(p_unicode != NULL);
1591 unicode = *p_unicode;
1592
1593 assert(unicode != NULL);
1594 assert(PyUnicode_Check(unicode));
1595 assert(0 <= length);
1596
Victor Stinner910337b2011-10-03 03:20:16 +02001597 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001598 old_length = PyUnicode_WSTR_LENGTH(unicode);
1599 else
1600 old_length = PyUnicode_GET_LENGTH(unicode);
1601 if (old_length == length)
1602 return 0;
1603
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001604 if (length == 0) {
1605 Py_DECREF(*p_unicode);
1606 *p_unicode = unicode_empty;
1607 Py_INCREF(*p_unicode);
1608 return 0;
1609 }
1610
Victor Stinner488fa492011-12-12 00:01:39 +01001611 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001612 PyObject *copy = resize_copy(unicode, length);
1613 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001614 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 Py_DECREF(*p_unicode);
1616 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618 }
1619
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001621 PyObject *new_unicode = resize_compact(unicode, length);
1622 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001624 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001625 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001626 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001627 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001628}
1629
Alexander Belopolsky40018472011-02-26 01:02:56 +00001630int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 PyObject *unicode;
1634 if (p_unicode == NULL) {
1635 PyErr_BadInternalCall();
1636 return -1;
1637 }
1638 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001639 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001640 {
1641 PyErr_BadInternalCall();
1642 return -1;
1643 }
1644 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001645}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001646
Victor Stinnerc5166102012-02-22 13:55:02 +01001647/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001648
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001649 WARNING: The function doesn't copy the terminating null character and
1650 doesn't check the maximum character (may write a latin1 character in an
1651 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001652static void
1653unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1654 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001655{
1656 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1657 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001658 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001659
1660 switch (kind) {
1661 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001662 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001663#ifdef Py_DEBUG
1664 if (PyUnicode_IS_ASCII(unicode)) {
1665 Py_UCS4 maxchar = ucs1lib_find_max_char(
1666 (const Py_UCS1*)str,
1667 (const Py_UCS1*)str + len);
1668 assert(maxchar < 128);
1669 }
1670#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001671 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001672 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001673 }
1674 case PyUnicode_2BYTE_KIND: {
1675 Py_UCS2 *start = (Py_UCS2 *)data + index;
1676 Py_UCS2 *ucs2 = start;
1677 assert(index <= PyUnicode_GET_LENGTH(unicode));
1678
Victor Stinner184252a2012-06-16 02:57:41 +02001679 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001680 *ucs2 = (Py_UCS2)*str;
1681
1682 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001683 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001684 }
1685 default: {
1686 Py_UCS4 *start = (Py_UCS4 *)data + index;
1687 Py_UCS4 *ucs4 = start;
1688 assert(kind == PyUnicode_4BYTE_KIND);
1689 assert(index <= PyUnicode_GET_LENGTH(unicode));
1690
Victor Stinner184252a2012-06-16 02:57:41 +02001691 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001692 *ucs4 = (Py_UCS4)*str;
1693
1694 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001695 }
1696 }
1697}
1698
1699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700static PyObject*
1701get_latin1_char(unsigned char ch)
1702{
Victor Stinnera464fc12011-10-02 20:39:30 +02001703 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001705 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 if (!unicode)
1707 return NULL;
1708 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001709 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 unicode_latin1[ch] = unicode;
1711 }
1712 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001713 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714}
1715
Alexander Belopolsky40018472011-02-26 01:02:56 +00001716PyObject *
1717PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001719 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720 Py_UCS4 maxchar = 0;
1721 Py_ssize_t num_surrogates;
1722
1723 if (u == NULL)
1724 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001726 /* If the Unicode data is known at construction time, we can apply
1727 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 /* Optimization for empty strings */
1730 if (size == 0 && unicode_empty != NULL) {
1731 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001732 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001733 }
Tim Petersced69f82003-09-16 20:30:58 +00001734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 /* Single character Unicode objects in the Latin-1 range are
1736 shared when using this constructor */
1737 if (size == 1 && *u < 256)
1738 return get_latin1_char((unsigned char)*u);
1739
1740 /* If not empty and not single character, copy the Unicode data
1741 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001742 if (find_maxchar_surrogates(u, u + size,
1743 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 return NULL;
1745
Victor Stinner8faf8212011-12-08 22:14:11 +01001746 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 if (!unicode)
1748 return NULL;
1749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 switch (PyUnicode_KIND(unicode)) {
1751 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001752 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1754 break;
1755 case PyUnicode_2BYTE_KIND:
1756#if Py_UNICODE_SIZE == 2
1757 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1758#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001759 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1761#endif
1762 break;
1763 case PyUnicode_4BYTE_KIND:
1764#if SIZEOF_WCHAR_T == 2
1765 /* This is the only case which has to process surrogates, thus
1766 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001767 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768#else
1769 assert(num_surrogates == 0);
1770 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1771#endif
1772 break;
1773 default:
1774 assert(0 && "Impossible state");
1775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001777 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778}
1779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780PyObject *
1781PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001782{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001783 if (size < 0) {
1784 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001786 return NULL;
1787 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001788 if (u != NULL)
1789 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1790 else
1791 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001792}
1793
Alexander Belopolsky40018472011-02-26 01:02:56 +00001794PyObject *
1795PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001796{
1797 size_t size = strlen(u);
1798 if (size > PY_SSIZE_T_MAX) {
1799 PyErr_SetString(PyExc_OverflowError, "input too long");
1800 return NULL;
1801 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001802 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001803}
1804
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001805PyObject *
1806_PyUnicode_FromId(_Py_Identifier *id)
1807{
1808 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001809 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1810 strlen(id->string),
1811 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001812 if (!id->object)
1813 return NULL;
1814 PyUnicode_InternInPlace(&id->object);
1815 assert(!id->next);
1816 id->next = static_strings;
1817 static_strings = id;
1818 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001819 return id->object;
1820}
1821
1822void
1823_PyUnicode_ClearStaticStrings()
1824{
1825 _Py_Identifier *i;
1826 for (i = static_strings; i; i = i->next) {
1827 Py_DECREF(i->object);
1828 i->object = NULL;
1829 i->next = NULL;
1830 }
1831}
1832
Benjamin Peterson0df54292012-03-26 14:50:32 -04001833/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834
Victor Stinnerd3f08822012-05-29 12:57:52 +02001835PyObject*
1836_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001837{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001838 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001839 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001840 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001841#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001842 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001843#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001844 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001845 }
Victor Stinner785938e2011-12-11 20:09:03 +01001846 unicode = PyUnicode_New(size, 127);
1847 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001848 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001849 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1850 assert(_PyUnicode_CheckConsistency(unicode, 1));
1851 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001852}
1853
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001854static Py_UCS4
1855kind_maxchar_limit(unsigned int kind)
1856{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001857 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001858 case PyUnicode_1BYTE_KIND:
1859 return 0x80;
1860 case PyUnicode_2BYTE_KIND:
1861 return 0x100;
1862 case PyUnicode_4BYTE_KIND:
1863 return 0x10000;
1864 default:
1865 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001866 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001867 }
1868}
1869
Victor Stinnere6abb482012-05-02 01:15:40 +02001870Py_LOCAL_INLINE(Py_UCS4)
1871align_maxchar(Py_UCS4 maxchar)
1872{
1873 if (maxchar <= 127)
1874 return 127;
1875 else if (maxchar <= 255)
1876 return 255;
1877 else if (maxchar <= 65535)
1878 return 65535;
1879 else
1880 return MAX_UNICODE;
1881}
1882
Victor Stinner702c7342011-10-05 13:50:52 +02001883static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001884_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001887 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001888
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001889 if (size == 0) {
1890 Py_INCREF(unicode_empty);
1891 return unicode_empty;
1892 }
1893 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001894 if (size == 1)
1895 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001896
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001897 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001898 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001899 if (!res)
1900 return NULL;
1901 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001902 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001904}
1905
Victor Stinnere57b1c02011-09-28 22:20:48 +02001906static PyObject*
1907_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908{
1909 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001910 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001911
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001912 if (size == 0) {
1913 Py_INCREF(unicode_empty);
1914 return unicode_empty;
1915 }
1916 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001917 if (size == 1) {
1918 Py_UCS4 ch = u[0];
1919 if (ch < 256)
1920 return get_latin1_char((unsigned char)ch);
1921
1922 res = PyUnicode_New(1, ch);
1923 if (res == NULL)
1924 return NULL;
1925 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1926 assert(_PyUnicode_CheckConsistency(res, 1));
1927 return res;
1928 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001929
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001930 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001931 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 if (!res)
1933 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001934 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001936 else {
1937 _PyUnicode_CONVERT_BYTES(
1938 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1939 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001940 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941 return res;
1942}
1943
Victor Stinnere57b1c02011-09-28 22:20:48 +02001944static PyObject*
1945_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946{
1947 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001948 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001949
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001950 if (size == 0) {
1951 Py_INCREF(unicode_empty);
1952 return unicode_empty;
1953 }
1954 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001955 if (size == 1) {
1956 Py_UCS4 ch = u[0];
1957 if (ch < 256)
1958 return get_latin1_char((unsigned char)ch);
1959
1960 res = PyUnicode_New(1, ch);
1961 if (res == NULL)
1962 return NULL;
1963 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1964 assert(_PyUnicode_CheckConsistency(res, 1));
1965 return res;
1966 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001967
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001968 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001969 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001970 if (!res)
1971 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001972 if (max_char < 256)
1973 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1974 PyUnicode_1BYTE_DATA(res));
1975 else if (max_char < 0x10000)
1976 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1977 PyUnicode_2BYTE_DATA(res));
1978 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 return res;
1982}
1983
1984PyObject*
1985PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1986{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001987 if (size < 0) {
1988 PyErr_SetString(PyExc_ValueError, "size must be positive");
1989 return NULL;
1990 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001991 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001993 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001995 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001997 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001998 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001999 PyErr_SetString(PyExc_SystemError, "invalid kind");
2000 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002}
2003
Victor Stinnerece58de2012-04-23 23:36:38 +02002004Py_UCS4
2005_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2006{
2007 enum PyUnicode_Kind kind;
2008 void *startptr, *endptr;
2009
2010 assert(PyUnicode_IS_READY(unicode));
2011 assert(0 <= start);
2012 assert(end <= PyUnicode_GET_LENGTH(unicode));
2013 assert(start <= end);
2014
2015 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2016 return PyUnicode_MAX_CHAR_VALUE(unicode);
2017
2018 if (start == end)
2019 return 127;
2020
Victor Stinner94d558b2012-04-27 22:26:58 +02002021 if (PyUnicode_IS_ASCII(unicode))
2022 return 127;
2023
Victor Stinnerece58de2012-04-23 23:36:38 +02002024 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002025 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002026 endptr = (char *)startptr + end * kind;
2027 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002028 switch(kind) {
2029 case PyUnicode_1BYTE_KIND:
2030 return ucs1lib_find_max_char(startptr, endptr);
2031 case PyUnicode_2BYTE_KIND:
2032 return ucs2lib_find_max_char(startptr, endptr);
2033 case PyUnicode_4BYTE_KIND:
2034 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002035 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002036 assert(0);
2037 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002038 }
2039}
2040
Victor Stinner25a4b292011-10-06 12:31:55 +02002041/* Ensure that a string uses the most efficient storage, if it is not the
2042 case: create a new string with of the right kind. Write NULL into *p_unicode
2043 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002044static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002045unicode_adjust_maxchar(PyObject **p_unicode)
2046{
2047 PyObject *unicode, *copy;
2048 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002049 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002050 unsigned int kind;
2051
2052 assert(p_unicode != NULL);
2053 unicode = *p_unicode;
2054 assert(PyUnicode_IS_READY(unicode));
2055 if (PyUnicode_IS_ASCII(unicode))
2056 return;
2057
2058 len = PyUnicode_GET_LENGTH(unicode);
2059 kind = PyUnicode_KIND(unicode);
2060 if (kind == PyUnicode_1BYTE_KIND) {
2061 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002062 max_char = ucs1lib_find_max_char(u, u + len);
2063 if (max_char >= 128)
2064 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002065 }
2066 else if (kind == PyUnicode_2BYTE_KIND) {
2067 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002068 max_char = ucs2lib_find_max_char(u, u + len);
2069 if (max_char >= 256)
2070 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002071 }
2072 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002073 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002074 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002075 max_char = ucs4lib_find_max_char(u, u + len);
2076 if (max_char >= 0x10000)
2077 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002078 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002079 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002080 if (copy != NULL)
2081 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002082 Py_DECREF(unicode);
2083 *p_unicode = copy;
2084}
2085
Victor Stinner034f6cf2011-09-30 02:26:44 +02002086PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002087_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002088{
Victor Stinner87af4f22011-11-21 23:03:47 +01002089 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002090 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002091
Victor Stinner034f6cf2011-09-30 02:26:44 +02002092 if (!PyUnicode_Check(unicode)) {
2093 PyErr_BadInternalCall();
2094 return NULL;
2095 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002096 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002097 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002098
Victor Stinner87af4f22011-11-21 23:03:47 +01002099 length = PyUnicode_GET_LENGTH(unicode);
2100 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002101 if (!copy)
2102 return NULL;
2103 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2104
Victor Stinner87af4f22011-11-21 23:03:47 +01002105 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2106 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002107 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002108 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002109}
2110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002111
Victor Stinnerbc603d12011-10-02 01:00:40 +02002112/* Widen Unicode objects to larger buffers. Don't write terminating null
2113 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002114
2115void*
2116_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2117{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002118 Py_ssize_t len;
2119 void *result;
2120 unsigned int skind;
2121
Benjamin Petersonbac79492012-01-14 13:34:47 -05002122 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002123 return NULL;
2124
2125 len = PyUnicode_GET_LENGTH(s);
2126 skind = PyUnicode_KIND(s);
2127 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002128 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 return NULL;
2130 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002131 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002132 case PyUnicode_2BYTE_KIND:
2133 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2134 if (!result)
2135 return PyErr_NoMemory();
2136 assert(skind == PyUnicode_1BYTE_KIND);
2137 _PyUnicode_CONVERT_BYTES(
2138 Py_UCS1, Py_UCS2,
2139 PyUnicode_1BYTE_DATA(s),
2140 PyUnicode_1BYTE_DATA(s) + len,
2141 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002142 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002143 case PyUnicode_4BYTE_KIND:
2144 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2145 if (!result)
2146 return PyErr_NoMemory();
2147 if (skind == PyUnicode_2BYTE_KIND) {
2148 _PyUnicode_CONVERT_BYTES(
2149 Py_UCS2, Py_UCS4,
2150 PyUnicode_2BYTE_DATA(s),
2151 PyUnicode_2BYTE_DATA(s) + len,
2152 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002154 else {
2155 assert(skind == PyUnicode_1BYTE_KIND);
2156 _PyUnicode_CONVERT_BYTES(
2157 Py_UCS1, Py_UCS4,
2158 PyUnicode_1BYTE_DATA(s),
2159 PyUnicode_1BYTE_DATA(s) + len,
2160 result);
2161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163 default:
2164 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 }
Victor Stinner01698042011-10-04 00:04:26 +02002166 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167 return NULL;
2168}
2169
2170static Py_UCS4*
2171as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2172 int copy_null)
2173{
2174 int kind;
2175 void *data;
2176 Py_ssize_t len, targetlen;
2177 if (PyUnicode_READY(string) == -1)
2178 return NULL;
2179 kind = PyUnicode_KIND(string);
2180 data = PyUnicode_DATA(string);
2181 len = PyUnicode_GET_LENGTH(string);
2182 targetlen = len;
2183 if (copy_null)
2184 targetlen++;
2185 if (!target) {
2186 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2187 PyErr_NoMemory();
2188 return NULL;
2189 }
2190 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2191 if (!target) {
2192 PyErr_NoMemory();
2193 return NULL;
2194 }
2195 }
2196 else {
2197 if (targetsize < targetlen) {
2198 PyErr_Format(PyExc_SystemError,
2199 "string is longer than the buffer");
2200 if (copy_null && 0 < targetsize)
2201 target[0] = 0;
2202 return NULL;
2203 }
2204 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002205 if (kind == PyUnicode_1BYTE_KIND) {
2206 Py_UCS1 *start = (Py_UCS1 *) data;
2207 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002209 else if (kind == PyUnicode_2BYTE_KIND) {
2210 Py_UCS2 *start = (Py_UCS2 *) data;
2211 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2212 }
2213 else {
2214 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 if (copy_null)
2218 target[len] = 0;
2219 return target;
2220}
2221
2222Py_UCS4*
2223PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2224 int copy_null)
2225{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002226 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 PyErr_BadInternalCall();
2228 return NULL;
2229 }
2230 return as_ucs4(string, target, targetsize, copy_null);
2231}
2232
2233Py_UCS4*
2234PyUnicode_AsUCS4Copy(PyObject *string)
2235{
2236 return as_ucs4(string, NULL, 0, 1);
2237}
2238
2239#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002240
Alexander Belopolsky40018472011-02-26 01:02:56 +00002241PyObject *
2242PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002245 if (size == 0) {
2246 Py_INCREF(unicode_empty);
2247 return unicode_empty;
2248 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002249 PyErr_BadInternalCall();
2250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 }
2252
Martin v. Löwis790465f2008-04-05 20:41:37 +00002253 if (size == -1) {
2254 size = wcslen(w);
2255 }
2256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258}
2259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002261
Walter Dörwald346737f2007-05-31 10:44:43 +00002262static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002263makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002264 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002265{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002267 if (longflag)
2268 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002269 else if (longlongflag) {
2270 /* longlongflag should only ever be nonzero on machines with
2271 HAVE_LONG_LONG defined */
2272#ifdef HAVE_LONG_LONG
2273 char *f = PY_FORMAT_LONG_LONG;
2274 while (*f)
2275 *fmt++ = *f++;
2276#else
2277 /* we shouldn't ever get here */
2278 assert(0);
2279 *fmt++ = 'l';
2280#endif
2281 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 else if (size_tflag) {
2283 char *f = PY_FORMAT_SIZE_T;
2284 while (*f)
2285 *fmt++ = *f++;
2286 }
2287 *fmt++ = c;
2288 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002289}
2290
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign and 1 for the terminating null character written by
   sprintf() (hence the "2 +" below). 53/22 is an upper bound for log10(256). */
2294#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002295
2296static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002297unicode_fromformat_arg(_PyUnicodeWriter *writer,
2298 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002299{
Victor Stinnere215d962012-10-06 23:03:36 +02002300 const char *p;
2301 Py_ssize_t len;
2302 int zeropad;
2303 int width;
2304 int precision;
2305 int longflag;
2306 int longlongflag;
2307 int size_tflag;
2308 int fill;
2309
2310 p = f;
2311 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002312 zeropad = 0;
2313 if (*f == '0') {
2314 zeropad = 1;
2315 f++;
2316 }
Victor Stinner96865452011-03-01 23:44:09 +00002317
2318 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002319 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002320 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002321 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2322 PyErr_SetString(PyExc_ValueError,
2323 "width too big");
2324 return NULL;
2325 }
Victor Stinnere215d962012-10-06 23:03:36 +02002326 width = (width*10) + (*f - '0');
2327 f++;
2328 }
Victor Stinner96865452011-03-01 23:44:09 +00002329 precision = 0;
2330 if (*f == '.') {
2331 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002332 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002333 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2334 PyErr_SetString(PyExc_ValueError,
2335 "precision too big");
2336 return NULL;
2337 }
Victor Stinnere215d962012-10-06 23:03:36 +02002338 precision = (precision*10) + (*f - '0');
2339 f++;
2340 }
Victor Stinner96865452011-03-01 23:44:09 +00002341 if (*f == '%') {
2342 /* "%.3%s" => f points to "3" */
2343 f--;
2344 }
2345 }
2346 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002347 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002348 f--;
2349 }
Victor Stinner96865452011-03-01 23:44:09 +00002350
2351 /* Handle %ld, %lu, %lld and %llu. */
2352 longflag = 0;
2353 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002354 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002355 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002356 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002357 longflag = 1;
2358 ++f;
2359 }
2360#ifdef HAVE_LONG_LONG
2361 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002362 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002363 longlongflag = 1;
2364 f += 2;
2365 }
2366#endif
2367 }
2368 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002369 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002370 size_tflag = 1;
2371 ++f;
2372 }
Victor Stinnere215d962012-10-06 23:03:36 +02002373
2374 if (f[1] == '\0')
2375 writer->overallocate = 0;
2376
2377 switch (*f) {
2378 case 'c':
2379 {
2380 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002381 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2382 PyErr_SetString(PyExc_ValueError,
2383 "character argument not in range(0x110000)");
2384 return NULL;
2385 }
Victor Stinnere215d962012-10-06 23:03:36 +02002386 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2387 return NULL;
2388 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2389 writer->pos++;
2390 break;
2391 }
2392
2393 case 'i':
2394 case 'd':
2395 case 'u':
2396 case 'x':
2397 {
2398 /* used by sprintf */
2399 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002400 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002401
2402 if (*f == 'u') {
2403 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2404
2405 if (longflag)
2406 len = sprintf(buffer, fmt,
2407 va_arg(*vargs, unsigned long));
2408#ifdef HAVE_LONG_LONG
2409 else if (longlongflag)
2410 len = sprintf(buffer, fmt,
2411 va_arg(*vargs, unsigned PY_LONG_LONG));
2412#endif
2413 else if (size_tflag)
2414 len = sprintf(buffer, fmt,
2415 va_arg(*vargs, size_t));
2416 else
2417 len = sprintf(buffer, fmt,
2418 va_arg(*vargs, unsigned int));
2419 }
2420 else if (*f == 'x') {
2421 makefmt(fmt, 0, 0, 0, 'x');
2422 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2423 }
2424 else {
2425 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2426
2427 if (longflag)
2428 len = sprintf(buffer, fmt,
2429 va_arg(*vargs, long));
2430#ifdef HAVE_LONG_LONG
2431 else if (longlongflag)
2432 len = sprintf(buffer, fmt,
2433 va_arg(*vargs, PY_LONG_LONG));
2434#endif
2435 else if (size_tflag)
2436 len = sprintf(buffer, fmt,
2437 va_arg(*vargs, Py_ssize_t));
2438 else
2439 len = sprintf(buffer, fmt,
2440 va_arg(*vargs, int));
2441 }
2442 assert(len >= 0);
2443
Victor Stinnere215d962012-10-06 23:03:36 +02002444 if (precision < len)
2445 precision = len;
2446 if (width > precision) {
2447 Py_UCS4 fillchar;
2448 fill = width - precision;
2449 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002450 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2451 return NULL;
2452 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2453 return NULL;
2454 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002455 }
Victor Stinner15a11362012-10-06 23:48:20 +02002456 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002457 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002458 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2459 return NULL;
2460 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2461 return NULL;
2462 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002463 }
Victor Stinner15a11362012-10-06 23:48:20 +02002464 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002465 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002466 break;
2467 }
2468
2469 case 'p':
2470 {
2471 char number[MAX_LONG_LONG_CHARS];
2472
2473 len = sprintf(number, "%p", va_arg(*vargs, void*));
2474 assert(len >= 0);
2475
2476 /* %p is ill-defined: ensure leading 0x. */
2477 if (number[1] == 'X')
2478 number[1] = 'x';
2479 else if (number[1] != 'x') {
2480 memmove(number + 2, number,
2481 strlen(number) + 1);
2482 number[0] = '0';
2483 number[1] = 'x';
2484 len += 2;
2485 }
2486
2487 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2488 return NULL;
2489 break;
2490 }
2491
2492 case 's':
2493 {
2494 /* UTF-8 */
2495 const char *s = va_arg(*vargs, const char*);
2496 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2497 if (!str)
2498 return NULL;
2499 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2500 Py_DECREF(str);
2501 return NULL;
2502 }
2503 Py_DECREF(str);
2504 break;
2505 }
2506
2507 case 'U':
2508 {
2509 PyObject *obj = va_arg(*vargs, PyObject *);
2510 assert(obj && _PyUnicode_CHECK(obj));
2511
2512 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2513 return NULL;
2514 break;
2515 }
2516
2517 case 'V':
2518 {
2519 PyObject *obj = va_arg(*vargs, PyObject *);
2520 const char *str = va_arg(*vargs, const char *);
2521 PyObject *str_obj;
2522 assert(obj || str);
2523 if (obj) {
2524 assert(_PyUnicode_CHECK(obj));
2525 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2526 return NULL;
2527 }
2528 else {
2529 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2530 if (!str_obj)
2531 return NULL;
2532 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2533 Py_DECREF(str_obj);
2534 return NULL;
2535 }
2536 Py_DECREF(str_obj);
2537 }
2538 break;
2539 }
2540
2541 case 'S':
2542 {
2543 PyObject *obj = va_arg(*vargs, PyObject *);
2544 PyObject *str;
2545 assert(obj);
2546 str = PyObject_Str(obj);
2547 if (!str)
2548 return NULL;
2549 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2550 Py_DECREF(str);
2551 return NULL;
2552 }
2553 Py_DECREF(str);
2554 break;
2555 }
2556
2557 case 'R':
2558 {
2559 PyObject *obj = va_arg(*vargs, PyObject *);
2560 PyObject *repr;
2561 assert(obj);
2562 repr = PyObject_Repr(obj);
2563 if (!repr)
2564 return NULL;
2565 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2566 Py_DECREF(repr);
2567 return NULL;
2568 }
2569 Py_DECREF(repr);
2570 break;
2571 }
2572
2573 case 'A':
2574 {
2575 PyObject *obj = va_arg(*vargs, PyObject *);
2576 PyObject *ascii;
2577 assert(obj);
2578 ascii = PyObject_ASCII(obj);
2579 if (!ascii)
2580 return NULL;
2581 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2582 Py_DECREF(ascii);
2583 return NULL;
2584 }
2585 Py_DECREF(ascii);
2586 break;
2587 }
2588
2589 case '%':
2590 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2591 return NULL;
2592 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2593 writer->pos++;
2594 break;
2595
2596 default:
2597 /* if we stumble upon an unknown formatting code, copy the rest
2598 of the format string to the output string. (we cannot just
2599 skip the code, since there's no way to know what's in the
2600 argument list) */
2601 len = strlen(p);
2602 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2603 return NULL;
2604 f = p+len;
2605 return f;
2606 }
2607
2608 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002609 return f;
2610}
2611
Walter Dörwaldd2034312007-05-18 16:29:38 +00002612PyObject *
2613PyUnicode_FromFormatV(const char *format, va_list vargs)
2614{
Victor Stinnere215d962012-10-06 23:03:36 +02002615 va_list vargs2;
2616 const char *f;
2617 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002618
Victor Stinnere215d962012-10-06 23:03:36 +02002619 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2620
2621 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2622 Copy it to be able to pass a reference to a subfunction. */
2623 Py_VA_COPY(vargs2, vargs);
2624
2625 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002627 f = unicode_fromformat_arg(&writer, f, &vargs2);
2628 if (f == NULL)
2629 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002632 const char *p;
2633 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002634
Victor Stinnere215d962012-10-06 23:03:36 +02002635 p = f;
2636 do
2637 {
2638 if ((unsigned char)*p > 127) {
2639 PyErr_Format(PyExc_ValueError,
2640 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2641 "string, got a non-ASCII byte: 0x%02x",
2642 (unsigned char)*p);
2643 return NULL;
2644 }
2645 p++;
2646 }
2647 while (*p != '\0' && *p != '%');
2648 len = p - f;
2649
2650 if (*p == '\0')
2651 writer.overallocate = 0;
2652 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2653 goto fail;
2654 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2655 writer.pos += len;
2656
2657 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002658 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 }
Victor Stinnere215d962012-10-06 23:03:36 +02002660 return _PyUnicodeWriter_Finish(&writer);
2661
2662 fail:
2663 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002664 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002665}
2666
Walter Dörwaldd2034312007-05-18 16:29:38 +00002667PyObject *
2668PyUnicode_FromFormat(const char *format, ...)
2669{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 PyObject* ret;
2671 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002672
2673#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002675#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002676 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002677#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 ret = PyUnicode_FromFormatV(format, vargs);
2679 va_end(vargs);
2680 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002681}
2682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683#ifdef HAVE_WCHAR_H
2684
Victor Stinner5593d8a2010-10-02 11:11:27 +00002685/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2686 convert a Unicode object to a wide character string.
2687
Victor Stinnerd88d9832011-09-06 02:00:05 +02002688 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002689 character) required to convert the unicode object. Ignore size argument.
2690
Victor Stinnerd88d9832011-09-06 02:00:05 +02002691 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002692 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002693 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002694static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002695unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002696 wchar_t *w,
2697 Py_ssize_t size)
2698{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002699 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 const wchar_t *wstr;
2701
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002702 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002703 if (wstr == NULL)
2704 return -1;
2705
Victor Stinner5593d8a2010-10-02 11:11:27 +00002706 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002707 if (size > res)
2708 size = res + 1;
2709 else
2710 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002712 return res;
2713 }
2714 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002716}
2717
2718Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002719PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002720 wchar_t *w,
2721 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722{
2723 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002724 PyErr_BadInternalCall();
2725 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002727 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728}
2729
Victor Stinner137c34c2010-09-29 10:25:54 +00002730wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002731PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002732 Py_ssize_t *size)
2733{
2734 wchar_t* buffer;
2735 Py_ssize_t buflen;
2736
2737 if (unicode == NULL) {
2738 PyErr_BadInternalCall();
2739 return NULL;
2740 }
2741
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002742 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 if (buflen == -1)
2744 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002745 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002746 PyErr_NoMemory();
2747 return NULL;
2748 }
2749
Victor Stinner137c34c2010-09-29 10:25:54 +00002750 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2751 if (buffer == NULL) {
2752 PyErr_NoMemory();
2753 return NULL;
2754 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002755 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002756 if (buflen == -1) {
2757 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002758 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002759 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002760 if (size != NULL)
2761 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002762 return buffer;
2763}
2764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002765#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766
Alexander Belopolsky40018472011-02-26 01:02:56 +00002767PyObject *
2768PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002771 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002772 PyErr_SetString(PyExc_ValueError,
2773 "chr() arg not in range(0x110000)");
2774 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002775 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002777 if (ordinal < 256)
2778 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 v = PyUnicode_New(1, ordinal);
2781 if (v == NULL)
2782 return NULL;
2783 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002784 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002785 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002786}
2787
Alexander Belopolsky40018472011-02-26 01:02:56 +00002788PyObject *
2789PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002791 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002792 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002793 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002794 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002795 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002796 Py_INCREF(obj);
2797 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002798 }
2799 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002800 /* For a Unicode subtype that's not a Unicode object,
2801 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002802 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002803 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002804 PyErr_Format(PyExc_TypeError,
2805 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002806 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002807 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002808}
2809
Alexander Belopolsky40018472011-02-26 01:02:56 +00002810PyObject *
2811PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002812 const char *encoding,
2813 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002814{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002815 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002816 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002817
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002819 PyErr_BadInternalCall();
2820 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002822
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002823 /* Decoding bytes objects is the most common case and should be fast */
2824 if (PyBytes_Check(obj)) {
2825 if (PyBytes_GET_SIZE(obj) == 0) {
2826 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002827 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002828 }
2829 else {
2830 v = PyUnicode_Decode(
2831 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2832 encoding, errors);
2833 }
2834 return v;
2835 }
2836
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002837 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002838 PyErr_SetString(PyExc_TypeError,
2839 "decoding str is not supported");
2840 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002841 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002842
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002843 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2844 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2845 PyErr_Format(PyExc_TypeError,
2846 "coercing to str: need bytes, bytearray "
2847 "or buffer-like object, %.80s found",
2848 Py_TYPE(obj)->tp_name);
2849 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002850 }
Tim Petersced69f82003-09-16 20:30:58 +00002851
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002852 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002854 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 }
Tim Petersced69f82003-09-16 20:30:58 +00002856 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002857 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002858
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002859 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002860 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861}
2862
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result is written, null-terminated, into lower, a buffer of lower_len
   bytes.  Return 1 on success, 0 on error (the normalized name, including
   its terminating null byte, does not fit into lower_len bytes).  On error
   the content of lower is unspecified and may not be null-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* No room even for the terminating null byte; also keeps
       lower_len - 1 below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* last byte of the buffer, reserved for the terminating null byte */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2899
Alexander Belopolsky40018472011-02-26 01:02:56 +00002900PyObject *
2901PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002902 Py_ssize_t size,
2903 const char *encoding,
2904 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002905{
2906 PyObject *buffer = NULL, *unicode;
2907 Py_buffer info;
2908 char lower[11]; /* Enough for any encoding shortcut */
2909
Fred Drakee4315f52000-05-09 19:53:39 +00002910 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002911 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002912 if ((strcmp(lower, "utf-8") == 0) ||
2913 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002914 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002915 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002916 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002917 (strcmp(lower, "iso-8859-1") == 0))
2918 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002919#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002920 else if (strcmp(lower, "mbcs") == 0)
2921 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002922#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002923 else if (strcmp(lower, "ascii") == 0)
2924 return PyUnicode_DecodeASCII(s, size, errors);
2925 else if (strcmp(lower, "utf-16") == 0)
2926 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2927 else if (strcmp(lower, "utf-32") == 0)
2928 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930
2931 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002932 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002933 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002934 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002935 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 if (buffer == NULL)
2937 goto onError;
2938 unicode = PyCodec_Decode(buffer, encoding, errors);
2939 if (unicode == NULL)
2940 goto onError;
2941 if (!PyUnicode_Check(unicode)) {
2942 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002943 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002944 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945 Py_DECREF(unicode);
2946 goto onError;
2947 }
2948 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002949 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002950
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952 Py_XDECREF(buffer);
2953 return NULL;
2954}
2955
Alexander Belopolsky40018472011-02-26 01:02:56 +00002956PyObject *
2957PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002958 const char *encoding,
2959 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002960{
2961 PyObject *v;
2962
2963 if (!PyUnicode_Check(unicode)) {
2964 PyErr_BadArgument();
2965 goto onError;
2966 }
2967
2968 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002969 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002970
2971 /* Decode via the codec registry */
2972 v = PyCodec_Decode(unicode, encoding, errors);
2973 if (v == NULL)
2974 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002975 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002976
Benjamin Peterson29060642009-01-31 22:14:21 +00002977 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002978 return NULL;
2979}
2980
Alexander Belopolsky40018472011-02-26 01:02:56 +00002981PyObject *
2982PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002983 const char *encoding,
2984 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002985{
2986 PyObject *v;
2987
2988 if (!PyUnicode_Check(unicode)) {
2989 PyErr_BadArgument();
2990 goto onError;
2991 }
2992
2993 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002994 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002995
2996 /* Decode via the codec registry */
2997 v = PyCodec_Decode(unicode, encoding, errors);
2998 if (v == NULL)
2999 goto onError;
3000 if (!PyUnicode_Check(v)) {
3001 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003002 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003003 Py_TYPE(v)->tp_name);
3004 Py_DECREF(v);
3005 goto onError;
3006 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003007 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003008
Benjamin Peterson29060642009-01-31 22:14:21 +00003009 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003010 return NULL;
3011}
3012
Alexander Belopolsky40018472011-02-26 01:02:56 +00003013PyObject *
3014PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003015 Py_ssize_t size,
3016 const char *encoding,
3017 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018{
3019 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003020
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021 unicode = PyUnicode_FromUnicode(s, size);
3022 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3025 Py_DECREF(unicode);
3026 return v;
3027}
3028
Alexander Belopolsky40018472011-02-26 01:02:56 +00003029PyObject *
3030PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003031 const char *encoding,
3032 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003033{
3034 PyObject *v;
3035
3036 if (!PyUnicode_Check(unicode)) {
3037 PyErr_BadArgument();
3038 goto onError;
3039 }
3040
3041 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003043
3044 /* Encode via the codec registry */
3045 v = PyCodec_Encode(unicode, encoding, errors);
3046 if (v == NULL)
3047 goto onError;
3048 return v;
3049
Benjamin Peterson29060642009-01-31 22:14:21 +00003050 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003051 return NULL;
3052}
3053
/* Find the first wide character of the NUL-terminated string wstr that
   wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide).  Return the index, in wchar_t units,
   of the first character for which wcstombs() fails, or 0 if every
   character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + NUL */
#else
    wchar_t unit[2];            /* one character + NUL */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;
    const wchar_t *mark;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif
    for (cursor = wstr; *cursor != L'\0'; ) {
        /* Remember where the character under test starts */
        mark = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return mark - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3100
Victor Stinner1b579672011-12-17 05:47:23 +01003101static int
3102locale_error_handler(const char *errors, int *surrogateescape)
3103{
3104 if (errors == NULL) {
3105 *surrogateescape = 0;
3106 return 0;
3107 }
3108
3109 if (strcmp(errors, "strict") == 0) {
3110 *surrogateescape = 0;
3111 return 0;
3112 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003113 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003114 *surrogateescape = 1;
3115 return 0;
3116 }
3117 PyErr_Format(PyExc_ValueError,
3118 "only 'strict' and 'surrogateescape' error handlers "
3119 "are supported, not '%s'",
3120 errors);
3121 return -1;
3122}
3123
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003124PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003125PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003126{
3127 Py_ssize_t wlen, wlen2;
3128 wchar_t *wstr;
3129 PyObject *bytes = NULL;
3130 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003131 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003132 PyObject *exc;
3133 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003134 int surrogateescape;
3135
3136 if (locale_error_handler(errors, &surrogateescape) < 0)
3137 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003138
3139 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3140 if (wstr == NULL)
3141 return NULL;
3142
3143 wlen2 = wcslen(wstr);
3144 if (wlen2 != wlen) {
3145 PyMem_Free(wstr);
3146 PyErr_SetString(PyExc_TypeError, "embedded null character");
3147 return NULL;
3148 }
3149
3150 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003151 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003152 char *str;
3153
3154 str = _Py_wchar2char(wstr, &error_pos);
3155 if (str == NULL) {
3156 if (error_pos == (size_t)-1) {
3157 PyErr_NoMemory();
3158 PyMem_Free(wstr);
3159 return NULL;
3160 }
3161 else {
3162 goto encode_error;
3163 }
3164 }
3165 PyMem_Free(wstr);
3166
3167 bytes = PyBytes_FromString(str);
3168 PyMem_Free(str);
3169 }
3170 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003171 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003172 size_t len, len2;
3173
3174 len = wcstombs(NULL, wstr, 0);
3175 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003176 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003177 goto encode_error;
3178 }
3179
3180 bytes = PyBytes_FromStringAndSize(NULL, len);
3181 if (bytes == NULL) {
3182 PyMem_Free(wstr);
3183 return NULL;
3184 }
3185
3186 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3187 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003188 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003189 goto encode_error;
3190 }
3191 PyMem_Free(wstr);
3192 }
3193 return bytes;
3194
3195encode_error:
3196 errmsg = strerror(errno);
3197 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003198
3199 if (error_pos == (size_t)-1)
3200 error_pos = wcstombs_errorpos(wstr);
3201
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003202 PyMem_Free(wstr);
3203 Py_XDECREF(bytes);
3204
Victor Stinner2f197072011-12-17 07:08:30 +01003205 if (errmsg != NULL) {
3206 size_t errlen;
3207 wstr = _Py_char2wchar(errmsg, &errlen);
3208 if (wstr != NULL) {
3209 reason = PyUnicode_FromWideChar(wstr, errlen);
3210 PyMem_Free(wstr);
3211 } else
3212 errmsg = NULL;
3213 }
3214 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003215 reason = PyUnicode_FromString(
3216 "wcstombs() encountered an unencodable "
3217 "wide character");
3218 if (reason == NULL)
3219 return NULL;
3220
3221 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3222 "locale", unicode,
3223 (Py_ssize_t)error_pos,
3224 (Py_ssize_t)(error_pos+1),
3225 reason);
3226 Py_DECREF(reason);
3227 if (exc != NULL) {
3228 PyCodec_StrictErrors(exc);
3229 Py_XDECREF(exc);
3230 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003231 return NULL;
3232}
3233
Victor Stinnerad158722010-10-27 00:25:46 +00003234PyObject *
3235PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003236{
Victor Stinner99b95382011-07-04 14:23:54 +02003237#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003238 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003239#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003240 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003241#else
Victor Stinner793b5312011-04-27 00:24:21 +02003242 PyInterpreterState *interp = PyThreadState_GET()->interp;
3243 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3244 cannot use it to encode and decode filenames before it is loaded. Load
3245 the Python codec requires to encode at least its own filename. Use the C
3246 version of the locale codec until the codec registry is initialized and
3247 the Python codec is loaded.
3248
3249 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3250 cannot only rely on it: check also interp->fscodec_initialized for
3251 subinterpreters. */
3252 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003253 return PyUnicode_AsEncodedString(unicode,
3254 Py_FileSystemDefaultEncoding,
3255 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003256 }
3257 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003258 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003259 }
Victor Stinnerad158722010-10-27 00:25:46 +00003260#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003261}
3262
Alexander Belopolsky40018472011-02-26 01:02:56 +00003263PyObject *
3264PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003265 const char *encoding,
3266 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267{
3268 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003269 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003270
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 if (!PyUnicode_Check(unicode)) {
3272 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003274 }
Fred Drakee4315f52000-05-09 19:53:39 +00003275
Fred Drakee4315f52000-05-09 19:53:39 +00003276 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003277 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003278 if ((strcmp(lower, "utf-8") == 0) ||
3279 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003280 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003281 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003282 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003283 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003284 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003285 }
Victor Stinner37296e82010-06-10 13:36:23 +00003286 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003287 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003288 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003289 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003290#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003291 else if (strcmp(lower, "mbcs") == 0)
3292 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003293#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003294 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003295 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297
3298 /* Encode via the codec registry */
3299 v = PyCodec_Encode(unicode, encoding, errors);
3300 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003301 return NULL;
3302
3303 /* The normal path */
3304 if (PyBytes_Check(v))
3305 return v;
3306
3307 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003308 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003309 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003310 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003311
3312 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3313 "encoder %s returned bytearray instead of bytes",
3314 encoding);
3315 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003316 Py_DECREF(v);
3317 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003318 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003319
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003320 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3321 Py_DECREF(v);
3322 return b;
3323 }
3324
3325 PyErr_Format(PyExc_TypeError,
3326 "encoder did not return a bytes object (type=%.400s)",
3327 Py_TYPE(v)->tp_name);
3328 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003329 return NULL;
3330}
3331
Alexander Belopolsky40018472011-02-26 01:02:56 +00003332PyObject *
3333PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003334 const char *encoding,
3335 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003336{
3337 PyObject *v;
3338
3339 if (!PyUnicode_Check(unicode)) {
3340 PyErr_BadArgument();
3341 goto onError;
3342 }
3343
3344 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003345 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003346
3347 /* Encode via the codec registry */
3348 v = PyCodec_Encode(unicode, encoding, errors);
3349 if (v == NULL)
3350 goto onError;
3351 if (!PyUnicode_Check(v)) {
3352 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003353 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003354 Py_TYPE(v)->tp_name);
3355 Py_DECREF(v);
3356 goto onError;
3357 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003359
Benjamin Peterson29060642009-01-31 22:14:21 +00003360 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361 return NULL;
3362}
3363
/* Find the first byte sequence in the len bytes at str that mbrtowc()
   cannot convert.

   Return the byte offset at which mbrtowc() reports a conversion error or
   an incomplete character.  Return 0 when no such sequence is found, and
   always when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor;
    size_t remaining;
    size_t consumed;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    for (cursor = str, remaining = len;
         remaining != 0;
         cursor += consumed, remaining -= consumed)
    {
        consumed = mbrtowc(&wc, (char*)cursor, remaining, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3394
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003395PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003396PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003397 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003398{
3399 wchar_t smallbuf[256];
3400 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3401 wchar_t *wstr;
3402 size_t wlen, wlen2;
3403 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003404 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003405 size_t error_pos;
3406 char *errmsg;
3407 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003408
3409 if (locale_error_handler(errors, &surrogateescape) < 0)
3410 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003411
3412 if (str[len] != '\0' || len != strlen(str)) {
3413 PyErr_SetString(PyExc_TypeError, "embedded null character");
3414 return NULL;
3415 }
3416
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003417 if (surrogateescape) {
3418 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003419 wstr = _Py_char2wchar(str, &wlen);
3420 if (wstr == NULL) {
3421 if (wlen == (size_t)-1)
3422 PyErr_NoMemory();
3423 else
3424 PyErr_SetFromErrno(PyExc_OSError);
3425 return NULL;
3426 }
3427
3428 unicode = PyUnicode_FromWideChar(wstr, wlen);
3429 PyMem_Free(wstr);
3430 }
3431 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003432 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003433#ifndef HAVE_BROKEN_MBSTOWCS
3434 wlen = mbstowcs(NULL, str, 0);
3435#else
3436 wlen = len;
3437#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003438 if (wlen == (size_t)-1)
3439 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003440 if (wlen+1 <= smallbuf_len) {
3441 wstr = smallbuf;
3442 }
3443 else {
3444 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3445 return PyErr_NoMemory();
3446
3447 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3448 if (!wstr)
3449 return PyErr_NoMemory();
3450 }
3451
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003452 wlen2 = mbstowcs(wstr, str, wlen+1);
3453 if (wlen2 == (size_t)-1) {
3454 if (wstr != smallbuf)
3455 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003456 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003457 }
3458#ifdef HAVE_BROKEN_MBSTOWCS
3459 assert(wlen2 == wlen);
3460#endif
3461 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3462 if (wstr != smallbuf)
3463 PyMem_Free(wstr);
3464 }
3465 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003466
3467decode_error:
3468 errmsg = strerror(errno);
3469 assert(errmsg != NULL);
3470
3471 error_pos = mbstowcs_errorpos(str, len);
3472 if (errmsg != NULL) {
3473 size_t errlen;
3474 wstr = _Py_char2wchar(errmsg, &errlen);
3475 if (wstr != NULL) {
3476 reason = PyUnicode_FromWideChar(wstr, errlen);
3477 PyMem_Free(wstr);
3478 } else
3479 errmsg = NULL;
3480 }
3481 if (errmsg == NULL)
3482 reason = PyUnicode_FromString(
3483 "mbstowcs() encountered an invalid multibyte sequence");
3484 if (reason == NULL)
3485 return NULL;
3486
3487 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3488 "locale", str, len,
3489 (Py_ssize_t)error_pos,
3490 (Py_ssize_t)(error_pos+1),
3491 reason);
3492 Py_DECREF(reason);
3493 if (exc != NULL) {
3494 PyCodec_StrictErrors(exc);
3495 Py_XDECREF(exc);
3496 }
3497 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003498}
3499
3500PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003501PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003502{
3503 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003504 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003505}
3506
3507
3508PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003509PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003510 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003511 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3512}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003513
Christian Heimes5894ba72007-11-04 11:43:14 +00003514PyObject*
3515PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3516{
Victor Stinner99b95382011-07-04 14:23:54 +02003517#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003518 return PyUnicode_DecodeMBCS(s, size, NULL);
3519#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003520 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003521#else
Victor Stinner793b5312011-04-27 00:24:21 +02003522 PyInterpreterState *interp = PyThreadState_GET()->interp;
3523 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3524 cannot use it to encode and decode filenames before it is loaded. Load
3525 the Python codec requires to encode at least its own filename. Use the C
3526 version of the locale codec until the codec registry is initialized and
3527 the Python codec is loaded.
3528
3529 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3530 cannot only rely on it: check also interp->fscodec_initialized for
3531 subinterpreters. */
3532 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003533 return PyUnicode_Decode(s, size,
3534 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003535 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003536 }
3537 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003538 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003539 }
Victor Stinnerad158722010-10-27 00:25:46 +00003540#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003541}
3542
Martin v. Löwis011e8422009-05-05 04:43:17 +00003543
3544int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003545_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003546{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003547 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003548
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003549 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003550 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003551 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3552 PyUnicode_GET_LENGTH(str), '\0', 1);
3553 if (pos == -1)
3554 return 0;
3555 else
3556 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003557}
3558
Antoine Pitrou13348842012-01-29 18:36:34 +01003559int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003560PyUnicode_FSConverter(PyObject* arg, void* addr)
3561{
3562 PyObject *output = NULL;
3563 Py_ssize_t size;
3564 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003565 if (arg == NULL) {
3566 Py_DECREF(*(PyObject**)addr);
3567 return 1;
3568 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003569 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003570 output = arg;
3571 Py_INCREF(output);
3572 }
3573 else {
3574 arg = PyUnicode_FromObject(arg);
3575 if (!arg)
3576 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003577 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003578 Py_DECREF(arg);
3579 if (!output)
3580 return 0;
3581 if (!PyBytes_Check(output)) {
3582 Py_DECREF(output);
3583 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3584 return 0;
3585 }
3586 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003587 size = PyBytes_GET_SIZE(output);
3588 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003589 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003590 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003591 Py_DECREF(output);
3592 return 0;
3593 }
3594 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003595 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003596}
3597
3598
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003599int
3600PyUnicode_FSDecoder(PyObject* arg, void* addr)
3601{
3602 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003603 if (arg == NULL) {
3604 Py_DECREF(*(PyObject**)addr);
3605 return 1;
3606 }
3607 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003608 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003609 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003610 output = arg;
3611 Py_INCREF(output);
3612 }
3613 else {
3614 arg = PyBytes_FromObject(arg);
3615 if (!arg)
3616 return 0;
3617 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3618 PyBytes_GET_SIZE(arg));
3619 Py_DECREF(arg);
3620 if (!output)
3621 return 0;
3622 if (!PyUnicode_Check(output)) {
3623 Py_DECREF(output);
3624 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3625 return 0;
3626 }
3627 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003628 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003629 Py_DECREF(output);
3630 return 0;
3631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003632 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003633 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003634 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3635 Py_DECREF(output);
3636 return 0;
3637 }
3638 *(PyObject**)addr = output;
3639 return Py_CLEANUP_SUPPORTED;
3640}
3641
3642
Martin v. Löwis5b222132007-06-10 09:51:05 +00003643char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003644PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003645{
Christian Heimesf3863112007-11-22 07:46:41 +00003646 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003647
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003648 if (!PyUnicode_Check(unicode)) {
3649 PyErr_BadArgument();
3650 return NULL;
3651 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003652 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003653 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003654
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003655 if (PyUnicode_UTF8(unicode) == NULL) {
3656 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003657 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3658 if (bytes == NULL)
3659 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003660 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3661 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003662 Py_DECREF(bytes);
3663 return NULL;
3664 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003665 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3666 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3667 PyBytes_AS_STRING(bytes),
3668 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003669 Py_DECREF(bytes);
3670 }
3671
3672 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003673 *psize = PyUnicode_UTF8_LENGTH(unicode);
3674 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003675}
3676
3677char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003678PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003680 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3681}
3682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003683Py_UNICODE *
3684PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3685{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003686 const unsigned char *one_byte;
3687#if SIZEOF_WCHAR_T == 4
3688 const Py_UCS2 *two_bytes;
3689#else
3690 const Py_UCS4 *four_bytes;
3691 const Py_UCS4 *ucs4_end;
3692 Py_ssize_t num_surrogates;
3693#endif
3694 wchar_t *w;
3695 wchar_t *wchar_end;
3696
3697 if (!PyUnicode_Check(unicode)) {
3698 PyErr_BadArgument();
3699 return NULL;
3700 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003701 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003702 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003703 assert(_PyUnicode_KIND(unicode) != 0);
3704 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003705
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003706 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003707#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003708 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3709 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003710 num_surrogates = 0;
3711
3712 for (; four_bytes < ucs4_end; ++four_bytes) {
3713 if (*four_bytes > 0xFFFF)
3714 ++num_surrogates;
3715 }
3716
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003717 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3718 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3719 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003720 PyErr_NoMemory();
3721 return NULL;
3722 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003723 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003724
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003725 w = _PyUnicode_WSTR(unicode);
3726 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3727 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003728 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3729 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003730 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003731 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003732 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3733 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734 }
3735 else
3736 *w = *four_bytes;
3737
3738 if (w > wchar_end) {
3739 assert(0 && "Miscalculated string end");
3740 }
3741 }
3742 *w = 0;
3743#else
3744 /* sizeof(wchar_t) == 4 */
3745 Py_FatalError("Impossible unicode object state, wstr and str "
3746 "should share memory already.");
3747 return NULL;
3748#endif
3749 }
3750 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003751 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3752 (_PyUnicode_LENGTH(unicode) + 1));
3753 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003754 PyErr_NoMemory();
3755 return NULL;
3756 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003757 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3758 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3759 w = _PyUnicode_WSTR(unicode);
3760 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003762 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3763 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764 for (; w < wchar_end; ++one_byte, ++w)
3765 *w = *one_byte;
3766 /* null-terminate the wstr */
3767 *w = 0;
3768 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003769 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003770#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003771 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003772 for (; w < wchar_end; ++two_bytes, ++w)
3773 *w = *two_bytes;
3774 /* null-terminate the wstr */
3775 *w = 0;
3776#else
3777 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003778 PyObject_FREE(_PyUnicode_WSTR(unicode));
3779 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003780 Py_FatalError("Impossible unicode object state, wstr "
3781 "and str should share memory already.");
3782 return NULL;
3783#endif
3784 }
3785 else {
3786 assert(0 && "This should never happen.");
3787 }
3788 }
3789 }
3790 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003791 *size = PyUnicode_WSTR_LENGTH(unicode);
3792 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003793}
3794
Alexander Belopolsky40018472011-02-26 01:02:56 +00003795Py_UNICODE *
3796PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003798 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799}
3800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003801
Alexander Belopolsky40018472011-02-26 01:02:56 +00003802Py_ssize_t
3803PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804{
3805 if (!PyUnicode_Check(unicode)) {
3806 PyErr_BadArgument();
3807 goto onError;
3808 }
3809 return PyUnicode_GET_SIZE(unicode);
3810
Benjamin Peterson29060642009-01-31 22:14:21 +00003811 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812 return -1;
3813}
3814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815Py_ssize_t
3816PyUnicode_GetLength(PyObject *unicode)
3817{
Victor Stinner07621332012-06-16 04:53:46 +02003818 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819 PyErr_BadArgument();
3820 return -1;
3821 }
Victor Stinner07621332012-06-16 04:53:46 +02003822 if (PyUnicode_READY(unicode) == -1)
3823 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824 return PyUnicode_GET_LENGTH(unicode);
3825}
3826
3827Py_UCS4
3828PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3829{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003830 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3831 PyErr_BadArgument();
3832 return (Py_UCS4)-1;
3833 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003834 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003835 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 return (Py_UCS4)-1;
3837 }
3838 return PyUnicode_READ_CHAR(unicode, index);
3839}
3840
3841int
3842PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3843{
3844 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003845 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003846 return -1;
3847 }
Victor Stinner488fa492011-12-12 00:01:39 +01003848 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003849 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003850 PyErr_SetString(PyExc_IndexError, "string index out of range");
3851 return -1;
3852 }
Victor Stinner488fa492011-12-12 00:01:39 +01003853 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003854 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003855 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3856 PyErr_SetString(PyExc_ValueError, "character out of range");
3857 return -1;
3858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3860 index, ch);
3861 return 0;
3862}
3863
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3869
Victor Stinner554f3f02010-06-16 23:33:54 +00003870/* create or adjust a UnicodeDecodeError */
3871static void
3872make_decode_exception(PyObject **exceptionObject,
3873 const char *encoding,
3874 const char *input, Py_ssize_t length,
3875 Py_ssize_t startpos, Py_ssize_t endpos,
3876 const char *reason)
3877{
3878 if (*exceptionObject == NULL) {
3879 *exceptionObject = PyUnicodeDecodeError_Create(
3880 encoding, input, length, startpos, endpos, reason);
3881 }
3882 else {
3883 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3884 goto onError;
3885 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3886 goto onError;
3887 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3888 goto onError;
3889 }
3890 return;
3891
3892onError:
3893 Py_DECREF(*exceptionObject);
3894 *exceptionObject = NULL;
3895}
3896
Victor Stinnerfc009ef2012-11-07 00:36:38 +01003897#ifdef HAVE_MBCS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003898/* error handling callback helper:
3899 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003900 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901 and adjust various state variables.
3902 return 0 on success, -1 on error
3903*/
3904
Alexander Belopolsky40018472011-02-26 01:02:56 +00003905static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01003906unicode_decode_call_errorhandler_wchar(
3907 const char *errors, PyObject **errorHandler,
3908 const char *encoding, const char *reason,
3909 const char **input, const char **inend, Py_ssize_t *startinpos,
3910 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3911 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003913 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914
3915 PyObject *restuple = NULL;
3916 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003917 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003918 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003919 Py_ssize_t requiredsize;
3920 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003921 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01003922 wchar_t *repwstr;
3923 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924
Victor Stinnerfc009ef2012-11-07 00:36:38 +01003925 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
3926 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01003927
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003929 *errorHandler = PyCodec_LookupError(errors);
3930 if (*errorHandler == NULL)
3931 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932 }
3933
Victor Stinner554f3f02010-06-16 23:33:54 +00003934 make_decode_exception(exceptionObject,
3935 encoding,
3936 *input, *inend - *input,
3937 *startinpos, *endinpos,
3938 reason);
3939 if (*exceptionObject == NULL)
3940 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003941
3942 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3943 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003944 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003946 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003947 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 }
3949 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01003951
3952 /* Copy back the bytes variables, which might have been modified by the
3953 callback */
3954 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3955 if (!inputobj)
3956 goto onError;
3957 if (!PyBytes_Check(inputobj)) {
3958 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3959 }
3960 *input = PyBytes_AS_STRING(inputobj);
3961 insize = PyBytes_GET_SIZE(inputobj);
3962 *inend = *input + insize;
3963 /* we can DECREF safely, as the exception has another reference,
3964 so the object won't go away. */
3965 Py_DECREF(inputobj);
3966
3967 if (newpos<0)
3968 newpos = insize+newpos;
3969 if (newpos<0 || newpos>insize) {
3970 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3971 goto onError;
3972 }
3973
3974 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3975 if (repwstr == NULL)
3976 goto onError;
3977 /* need more space? (at least enough for what we
3978 have+the replacement+the rest of the string (starting
3979 at the new input position), so we won't have to check space
3980 when there are no errors in the rest of the string) */
3981 requiredsize = *outpos + repwlen + insize-newpos;
3982 if (requiredsize > outsize) {
3983 if (requiredsize < 2*outsize)
3984 requiredsize = 2*outsize;
3985 if (unicode_resize(output, requiredsize) < 0)
3986 goto onError;
3987 }
3988 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3989 *outpos += repwlen;
3990
3991 *endinpos = newpos;
3992 *inptr = *input + newpos;
3993
3994 /* we made it! */
3995 Py_XDECREF(restuple);
3996 return 0;
3997
3998 onError:
3999 Py_XDECREF(restuple);
4000 return -1;
4001}
4002#endif /* HAVE_MBCS */
4003
4004static int
4005unicode_decode_call_errorhandler_writer(
4006 const char *errors, PyObject **errorHandler,
4007 const char *encoding, const char *reason,
4008 const char **input, const char **inend, Py_ssize_t *startinpos,
4009 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4010 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4011{
4012 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4013
4014 PyObject *restuple = NULL;
4015 PyObject *repunicode = NULL;
4016 Py_ssize_t insize;
4017 Py_ssize_t newpos;
4018 PyObject *inputobj = NULL;
4019
4020 if (*errorHandler == NULL) {
4021 *errorHandler = PyCodec_LookupError(errors);
4022 if (*errorHandler == NULL)
4023 goto onError;
4024 }
4025
4026 make_decode_exception(exceptionObject,
4027 encoding,
4028 *input, *inend - *input,
4029 *startinpos, *endinpos,
4030 reason);
4031 if (*exceptionObject == NULL)
4032 goto onError;
4033
4034 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4035 if (restuple == NULL)
4036 goto onError;
4037 if (!PyTuple_Check(restuple)) {
4038 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4039 goto onError;
4040 }
4041 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004042 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004043
4044 /* Copy back the bytes variables, which might have been modified by the
4045 callback */
4046 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4047 if (!inputobj)
4048 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004049 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004050 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004051 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004052 *input = PyBytes_AS_STRING(inputobj);
4053 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004054 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004055 /* we can DECREF safely, as the exception has another reference,
4056 so the object won't go away. */
4057 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004058
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004060 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004061 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004062 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4063 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004064 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004066 writer->overallocate = 1;
4067 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4068 return
4069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004071 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004072
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004074 Py_XDECREF(restuple);
4075 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004076
Benjamin Peterson29060642009-01-31 22:14:21 +00004077 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004078 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004079 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080}
4081
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004082/* --- UTF-7 Codec -------------------------------------------------------- */
4083
Antoine Pitrou244651a2009-05-04 18:56:13 +00004084/* See RFC2152 for details. We encode conservatively and decode liberally. */
4085
4086/* Three simple macros defining base-64. */
4087
/* IS_BASE64(c): non-zero when c is one of the 64 base-64 alphabet
   characters (A-Z, a-z, 0-9, '+', '/').  The argument is evaluated
   several times, so it must be free of side effects. */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' ||                          \
     (c) == '/')

/* FROM_BASE64(c): the 6-bit value (0..63) of base-64 character c.
   Only meaningful when IS_BASE64(c) holds: anything that is not a
   letter, a digit or '+' is mapped to 63, the value of '/'. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64(n): the base-64 character for the low 6 bits of n; any
   higher bits of n are masked off. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT(c): non-zero when a byte found in UTF-7 input stands for
 * itself.  Decoding is permissive: every ASCII byte is taken literally
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4116
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0..127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c:        character code; anything outside 1..127 is never direct
 *           (the range test runs first, so the table is not indexed
 *           out of bounds).
 * directO:  non-zero to emit Set O characters as themselves.
 * directWS: non-zero to emit whitespace characters as themselves.
 *
 * All three arguments are parenthesized in the expansion so that an
 * expression such as `flag ? 1 : 0` can be passed safely; c is
 * evaluated several times and must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004162
Alexander Belopolsky40018472011-02-26 01:02:56 +00004163PyObject *
4164PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004165 Py_ssize_t size,
4166 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004167{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004168 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4169}
4170
Antoine Pitrou244651a2009-05-04 18:56:13 +00004171/* The decoder. The only state we preserve is our read position,
4172 * i.e. how many characters we have consumed. So if we end in the
4173 * middle of a shift sequence we have to back off the read position
4174 * and the output to the beginning of the sequence, otherwise we lose
4175 * all the shift state (seen bits, number of bits seen, high
4176 * surrogate). */
4177
/* Decode `size` bytes of UTF-7 starting at `s`.

   s, size:  input buffer; an empty input returns a new reference to
             unicode_empty.
   errors:   error handler name, handed to
             unicode_decode_call_errorhandler_writer() on malformed input.
   consumed: if NULL, the input is treated as complete and an
             inconsistent shift sequence left open at the end is reported
             as an error.  If not NULL, it receives the number of input
             bytes consumed; when the input ends inside a shift sequence,
             the output produced by that sequence is dropped and
             *consumed is the offset of the '+' that opened it.

   Returns the result of _PyUnicodeWriter_Finish(), or NULL on error. */
PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;          /* input offset where the current '+' or bad byte starts */
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;       /* writer.pos when the current shift sequence began */
    unsigned int base64bits = 0;    /* number of valid bits held in base64buffer */
    unsigned long base64buffer = 0; /* bits collected from base-64 characters */
    Py_UCS4 surrogate = 0;          /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer, 0);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
        goto onError;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      /* Also entered by goto from the end-of-string error path below,
         when the error handler left input to process (s < e). */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_Prepare(&writer, 1, ch2) == -1)
                                goto onError;
                            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch2);
                            writer.pos++;
                            surrogate = 0;
                            /* outCh was used up by the pair; skip the
                               handling of outCh below */
                            continue;
                        }
                        else {
                            /* no low surrogate followed: write the pending
                               high surrogate on its own, then fall through
                               to handle outCh itself */
                            if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
                                goto onError;
                            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
                            writer.pos++;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_Prepare(&writer, 1, outCh) == -1)
                            goto onError;
                        PyUnicode_WRITE(writer.kind, writer.data, writer.pos, outCh);
                        writer.pos++;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* write a pending high surrogate that never got its
                       low half */
                    if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
                        goto onError;
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
                    writer.pos++;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
                        goto onError;
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
                    writer.pos++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_Prepare(&writer, 1, '+') == -1)
                    goto onError;
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '+');
                writer.pos++;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                /* remember the output position so that an unfinished
                   sequence can be backed off at the end */
                shiftOutStart = writer.pos;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
                goto onError;
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
            writer.pos++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        /* normal path: skip the error handling below */
        continue;
utf7Error:
        endinpos = s-starts;
        /* the helper may re-point starts, e and s */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have left input to decode: re-enter the loop */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            writer.pos = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4373
4374
Alexander Belopolsky40018472011-02-26 01:02:56 +00004375PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004376_PyUnicode_EncodeUTF7(PyObject *str,
4377 int base64SetO,
4378 int base64WhiteSpace,
4379 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004380{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004381 int kind;
4382 void *data;
4383 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004384 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004385 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004386 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004387 unsigned int base64bits = 0;
4388 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 char * out;
4390 char * start;
4391
Benjamin Petersonbac79492012-01-14 13:34:47 -05004392 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004393 return NULL;
4394 kind = PyUnicode_KIND(str);
4395 data = PyUnicode_DATA(str);
4396 len = PyUnicode_GET_LENGTH(str);
4397
4398 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004400
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004401 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004402 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004403 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004404 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 if (v == NULL)
4406 return NULL;
4407
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004408 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004409 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004410 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004411
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 if (inShift) {
4413 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4414 /* shifting out */
4415 if (base64bits) { /* output remaining bits */
4416 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4417 base64buffer = 0;
4418 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 }
4420 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 /* Characters not in the BASE64 set implicitly unshift the sequence
4422 so no '-' is required, except if the character is itself a '-' */
4423 if (IS_BASE64(ch) || ch == '-') {
4424 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004425 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 *out++ = (char) ch;
4427 }
4428 else {
4429 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004430 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004431 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004432 else { /* not in a shift sequence */
4433 if (ch == '+') {
4434 *out++ = '+';
4435 *out++ = '-';
4436 }
4437 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4438 *out++ = (char) ch;
4439 }
4440 else {
4441 *out++ = '+';
4442 inShift = 1;
4443 goto encode_char;
4444 }
4445 }
4446 continue;
4447encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004449 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004450
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 /* code first surrogate */
4452 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004453 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004454 while (base64bits >= 6) {
4455 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4456 base64bits -= 6;
4457 }
4458 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004459 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461 base64bits += 16;
4462 base64buffer = (base64buffer << 16) | ch;
4463 while (base64bits >= 6) {
4464 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4465 base64bits -= 6;
4466 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004467 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004468 if (base64bits)
4469 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4470 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004471 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004472 if (_PyBytes_Resize(&v, out - start) < 0)
4473 return NULL;
4474 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004475}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004476PyObject *
4477PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4478 Py_ssize_t size,
4479 int base64SetO,
4480 int base64WhiteSpace,
4481 const char *errors)
4482{
4483 PyObject *result;
4484 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4485 if (tmp == NULL)
4486 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004487 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004488 base64WhiteSpace, errors);
4489 Py_DECREF(tmp);
4490 return result;
4491}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493#undef IS_BASE64
4494#undef FROM_BASE64
4495#undef TO_BASE64
4496#undef DECODE_DIRECT
4497#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499/* --- UTF-8 Codec -------------------------------------------------------- */
4500
Alexander Belopolsky40018472011-02-26 01:02:56 +00004501PyObject *
4502PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004503 Py_ssize_t size,
4504 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505{
Walter Dörwald69652032004-09-07 20:24:22 +00004506 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4507}
4508
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004509#include "stringlib/asciilib.h"
4510#include "stringlib/codecs.h"
4511#include "stringlib/undef.h"
4512
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004513#include "stringlib/ucs1lib.h"
4514#include "stringlib/codecs.h"
4515#include "stringlib/undef.h"
4516
4517#include "stringlib/ucs2lib.h"
4518#include "stringlib/codecs.h"
4519#include "stringlib/undef.h"
4520
4521#include "stringlib/ucs4lib.h"
4522#include "stringlib/codecs.h"
4523#include "stringlib/undef.h"
4524
Antoine Pitrouab868312009-01-10 15:40:25 +00004525/* Mask to quickly check whether a C 'long' contains a
4526 non-ASCII, UTF8-encoded char. */
4527#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004528# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004529#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004530# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004531#else
4532# error C 'long' size should be either 4 or 8!
4533#endif
4534
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004535static Py_ssize_t
4536ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004537{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004538 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004539 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004540
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004541#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004542 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4543 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004544 /* Fast path, see in STRINGLIB(utf8_decode) for
4545 an explanation. */
4546 /* Help register allocation */
4547 register const char *_p = p;
4548 register Py_UCS1 * q = dest;
4549 while (_p < aligned_end) {
4550 unsigned long value = *(const unsigned long *) _p;
4551 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004553 *((unsigned long *)q) = value;
4554 _p += SIZEOF_LONG;
4555 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004556 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004557 p = _p;
4558 while (p < end) {
4559 if ((unsigned char)*p & 0x80)
4560 break;
4561 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004563 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004565#endif
4566 while (p < end) {
4567 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4568 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004569 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004570 /* Help register allocation */
4571 register const char *_p = p;
4572 while (_p < aligned_end) {
4573 unsigned long value = *(unsigned long *) _p;
4574 if (value & ASCII_CHAR_MASK)
4575 break;
4576 _p += SIZEOF_LONG;
4577 }
4578 p = _p;
4579 if (_p == end)
4580 break;
4581 }
4582 if ((unsigned char)*p & 0x80)
4583 break;
4584 ++p;
4585 }
4586 memcpy(dest, start, p - start);
4587 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588}
Antoine Pitrouab868312009-01-10 15:40:25 +00004589
Victor Stinner785938e2011-12-11 20:09:03 +01004590PyObject *
4591PyUnicode_DecodeUTF8Stateful(const char *s,
4592 Py_ssize_t size,
4593 const char *errors,
4594 Py_ssize_t *consumed)
4595{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004596 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004597 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004598 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004599
4600 Py_ssize_t startinpos;
4601 Py_ssize_t endinpos;
4602 const char *errmsg = "";
4603 PyObject *errorHandler = NULL;
4604 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004605
4606 if (size == 0) {
4607 if (consumed)
4608 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004609 Py_INCREF(unicode_empty);
4610 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004611 }
4612
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004613 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4614 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004615 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004616 *consumed = 1;
4617 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004618 }
4619
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004620 _PyUnicodeWriter_Init(&writer, 0);
4621 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4622 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004623
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004624 writer.pos = ascii_decode(s, end, writer.data);
4625 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004626 while (s < end) {
4627 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004628 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004629 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004630 if (PyUnicode_IS_ASCII(writer.buffer))
4631 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004632 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004633 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004634 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004635 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004636 } else {
4637 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004638 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004639 }
4640
4641 switch (ch) {
4642 case 0:
4643 if (s == end || consumed)
4644 goto End;
4645 errmsg = "unexpected end of data";
4646 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004647 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648 break;
4649 case 1:
4650 errmsg = "invalid start byte";
4651 startinpos = s - starts;
4652 endinpos = startinpos + 1;
4653 break;
4654 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004655 case 3:
4656 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004657 errmsg = "invalid continuation byte";
4658 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004659 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004660 break;
4661 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004662 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004663 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004664 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4665 writer.pos++;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004666 continue;
4667 }
4668
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004669 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004670 errors, &errorHandler,
4671 "utf-8", errmsg,
4672 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004673 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004675 }
4676
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004678 if (consumed)
4679 *consumed = s - starts;
4680
4681 Py_XDECREF(errorHandler);
4682 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004683 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004684
4685onError:
4686 Py_XDECREF(errorHandler);
4687 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004688 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004689 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004690}
4691
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004692#ifdef __APPLE__
4693
4694/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004695 used to decode the command line arguments on Mac OS X.
4696
4697 Return a pointer to a newly allocated wide character string (use
4698 PyMem_Free() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004699
4700wchar_t*
4701_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4702{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004703 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004704 wchar_t *unicode;
4705 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004706
4707 /* Note: size will always be longer than the resulting Unicode
4708 character count */
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004709 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004710 return NULL;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004711 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4712 if (!unicode)
4713 return NULL;
4714
4715 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004716 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004717 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004718 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004719 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004720#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004722#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004723 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004724#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004725 if (ch > 0xFF) {
4726#if SIZEOF_WCHAR_T == 4
4727 assert(0);
4728#else
4729 assert(Py_UNICODE_IS_SURROGATE(ch));
4730 /* compute and append the two surrogates: */
4731 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4732 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4733#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004734 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004735 else {
4736 if (!ch && s == e)
4737 break;
4738 /* surrogateescape */
4739 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4740 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004741 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004742 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004743 return unicode;
4744}
4745
4746#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004748/* Primary internal function which creates utf8 encoded bytes objects.
4749
4750 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004751 and allocate exactly as much space needed at the end. Else allocate the
4752 maximum possible needed (4 result bytes per Unicode character), and return
4753 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004754*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004755PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004756_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757{
Victor Stinner6099a032011-12-18 14:22:26 +01004758 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004759 void *data;
4760 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004762 if (!PyUnicode_Check(unicode)) {
4763 PyErr_BadArgument();
4764 return NULL;
4765 }
4766
4767 if (PyUnicode_READY(unicode) == -1)
4768 return NULL;
4769
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004770 if (PyUnicode_UTF8(unicode))
4771 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4772 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004773
4774 kind = PyUnicode_KIND(unicode);
4775 data = PyUnicode_DATA(unicode);
4776 size = PyUnicode_GET_LENGTH(unicode);
4777
Benjamin Petersonead6b532011-12-20 17:23:42 -06004778 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004779 default:
4780 assert(0);
4781 case PyUnicode_1BYTE_KIND:
4782 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4783 assert(!PyUnicode_IS_ASCII(unicode));
4784 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4785 case PyUnicode_2BYTE_KIND:
4786 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4787 case PyUnicode_4BYTE_KIND:
4788 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790}
4791
Alexander Belopolsky40018472011-02-26 01:02:56 +00004792PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004793PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4794 Py_ssize_t size,
4795 const char *errors)
4796{
4797 PyObject *v, *unicode;
4798
4799 unicode = PyUnicode_FromUnicode(s, size);
4800 if (unicode == NULL)
4801 return NULL;
4802 v = _PyUnicode_AsUTF8String(unicode, errors);
4803 Py_DECREF(unicode);
4804 return v;
4805}
4806
4807PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004808PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004810 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811}
4812
Walter Dörwald41980ca2007-08-16 21:55:45 +00004813/* --- UTF-32 Codec ------------------------------------------------------- */
4814
4815PyObject *
4816PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004817 Py_ssize_t size,
4818 const char *errors,
4819 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004820{
4821 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4822}
4823
4824PyObject *
4825PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004826 Py_ssize_t size,
4827 const char *errors,
4828 int *byteorder,
4829 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004830{
4831 const char *starts = s;
4832 Py_ssize_t startinpos;
4833 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004834 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004835 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004836 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004837 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004838 PyObject *errorHandler = NULL;
4839 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004840
Walter Dörwald41980ca2007-08-16 21:55:45 +00004841 q = (unsigned char *)s;
4842 e = q + size;
4843
4844 if (byteorder)
4845 bo = *byteorder;
4846
4847 /* Check for BOM marks (U+FEFF) in the input and adjust current
4848 byte order setting accordingly. In native mode, the leading BOM
4849 mark is skipped, in all other modes, it is copied to the output
4850 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004851 if (bo == 0 && size >= 4) {
4852 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4853 if (bom == 0x0000FEFF) {
4854 bo = -1;
4855 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004856 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004857 else if (bom == 0xFFFE0000) {
4858 bo = 1;
4859 q += 4;
4860 }
4861 if (byteorder)
4862 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004863 }
4864
Victor Stinnere64322e2012-10-30 23:12:47 +01004865 if (q == e) {
4866 if (consumed)
4867 *consumed = size;
4868 Py_INCREF(unicode_empty);
4869 return unicode_empty;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004870 }
4871
Victor Stinnere64322e2012-10-30 23:12:47 +01004872#ifdef WORDS_BIGENDIAN
4873 le = bo < 0;
4874#else
4875 le = bo <= 0;
4876#endif
4877
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004878 _PyUnicodeWriter_Init(&writer, 0);
4879 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
4880 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004881
Victor Stinnere64322e2012-10-30 23:12:47 +01004882 while (1) {
4883 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004884 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004885
Victor Stinnere64322e2012-10-30 23:12:47 +01004886 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004887 enum PyUnicode_Kind kind = writer.kind;
4888 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004889 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004890 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004891 if (le) {
4892 do {
4893 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4894 if (ch > maxch)
4895 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004896 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004897 q += 4;
4898 } while (q <= last);
4899 }
4900 else {
4901 do {
4902 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4903 if (ch > maxch)
4904 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004905 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004906 q += 4;
4907 } while (q <= last);
4908 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004909 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004910 }
4911
4912 if (ch <= maxch) {
4913 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01004915 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00004916 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01004917 startinpos = ((const char *)q) - starts;
4918 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00004919 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004920 else {
4921 if (ch < 0x110000) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004922 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinnere64322e2012-10-30 23:12:47 +01004923 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004924 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4925 writer.pos++;
Victor Stinnere64322e2012-10-30 23:12:47 +01004926 q += 4;
4927 continue;
4928 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01004930 startinpos = ((const char *)q) - starts;
4931 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004932 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004933
4934 /* The remaining input chars are ignored if the callback
4935 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004936 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 errors, &errorHandler,
4938 "utf32", errmsg,
4939 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004940 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004942 }
4943
Walter Dörwald41980ca2007-08-16 21:55:45 +00004944 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004945 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004946
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947 Py_XDECREF(errorHandler);
4948 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004949 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004952 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004953 Py_XDECREF(errorHandler);
4954 Py_XDECREF(exc);
4955 return NULL;
4956}
4957
4958PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004959_PyUnicode_EncodeUTF32(PyObject *str,
4960 const char *errors,
4961 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004963 int kind;
4964 void *data;
4965 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004966 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004967 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004968 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004969 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02004970#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00004971 int iorder[] = {0, 1, 2, 3};
4972#else
4973 int iorder[] = {3, 2, 1, 0};
4974#endif
4975
Benjamin Peterson29060642009-01-31 22:14:21 +00004976#define STORECHAR(CH) \
4977 do { \
4978 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4979 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4980 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4981 p[iorder[0]] = (CH) & 0xff; \
4982 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004983 } while(0)
4984
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004985 if (!PyUnicode_Check(str)) {
4986 PyErr_BadArgument();
4987 return NULL;
4988 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004989 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004990 return NULL;
4991 kind = PyUnicode_KIND(str);
4992 data = PyUnicode_DATA(str);
4993 len = PyUnicode_GET_LENGTH(str);
4994
4995 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004996 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00004997 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004998 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004999 if (v == NULL)
5000 return NULL;
5001
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005002 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005003 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005005 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005006 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005007
5008 if (byteorder == -1) {
5009 /* force LE */
5010 iorder[0] = 0;
5011 iorder[1] = 1;
5012 iorder[2] = 2;
5013 iorder[3] = 3;
5014 }
5015 else if (byteorder == 1) {
5016 /* force BE */
5017 iorder[0] = 3;
5018 iorder[1] = 2;
5019 iorder[2] = 1;
5020 iorder[3] = 0;
5021 }
5022
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005023 for (i = 0; i < len; i++)
5024 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005025
5026 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005027 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005028#undef STORECHAR
5029}
5030
Alexander Belopolsky40018472011-02-26 01:02:56 +00005031PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005032PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5033 Py_ssize_t size,
5034 const char *errors,
5035 int byteorder)
5036{
5037 PyObject *result;
5038 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5039 if (tmp == NULL)
5040 return NULL;
5041 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5042 Py_DECREF(tmp);
5043 return result;
5044}
5045
5046PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005047PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005048{
Victor Stinnerb960b342011-11-20 19:12:52 +01005049 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005050}
5051
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052/* --- UTF-16 Codec ------------------------------------------------------- */
5053
Tim Peters772747b2001-08-09 22:21:55 +00005054PyObject *
5055PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 Py_ssize_t size,
5057 const char *errors,
5058 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059{
Walter Dörwald69652032004-09-07 20:24:22 +00005060 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5061}
5062
5063PyObject *
5064PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 Py_ssize_t size,
5066 const char *errors,
5067 int *byteorder,
5068 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005069{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005070 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005071 Py_ssize_t startinpos;
5072 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005073 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005074 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005075 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005076 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005077 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005078 PyObject *errorHandler = NULL;
5079 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080
Tim Peters772747b2001-08-09 22:21:55 +00005081 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005082 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083
5084 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005085 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005087 /* Check for BOM marks (U+FEFF) in the input and adjust current
5088 byte order setting accordingly. In native mode, the leading BOM
5089 mark is skipped, in all other modes, it is copied to the output
5090 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005091 if (bo == 0 && size >= 2) {
5092 const Py_UCS4 bom = (q[1] << 8) | q[0];
5093 if (bom == 0xFEFF) {
5094 q += 2;
5095 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005096 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005097 else if (bom == 0xFFFE) {
5098 q += 2;
5099 bo = 1;
5100 }
5101 if (byteorder)
5102 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005103 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104
Antoine Pitrou63065d72012-05-15 23:48:04 +02005105 if (q == e) {
5106 if (consumed)
5107 *consumed = size;
5108 Py_INCREF(unicode_empty);
5109 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005110 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005111
Christian Heimes743e0cd2012-10-17 23:52:17 +02005112#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005113 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005114#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005115 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005116#endif
Tim Peters772747b2001-08-09 22:21:55 +00005117
Antoine Pitrou63065d72012-05-15 23:48:04 +02005118 /* Note: size will always be longer than the resulting Unicode
5119 character count */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005120 _PyUnicodeWriter_Init(&writer, 0);
5121 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
5122 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005123
Antoine Pitrou63065d72012-05-15 23:48:04 +02005124 while (1) {
5125 Py_UCS4 ch = 0;
5126 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005127 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005128 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005129 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005130 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005131 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005132 native_ordering);
5133 else
5134 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005135 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005136 native_ordering);
5137 } else if (kind == PyUnicode_2BYTE_KIND) {
5138 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005139 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005140 native_ordering);
5141 } else {
5142 assert(kind == PyUnicode_4BYTE_KIND);
5143 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005144 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005145 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005146 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005147 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005148
Antoine Pitrou63065d72012-05-15 23:48:04 +02005149 switch (ch)
5150 {
5151 case 0:
5152 /* remaining byte at the end? (size should be even) */
5153 if (q == e || consumed)
5154 goto End;
5155 errmsg = "truncated data";
5156 startinpos = ((const char *)q) - starts;
5157 endinpos = ((const char *)e) - starts;
5158 break;
5159 /* The remaining input chars are ignored if the callback
5160 chooses to skip the input */
5161 case 1:
5162 errmsg = "unexpected end of data";
5163 startinpos = ((const char *)q) - 2 - starts;
5164 endinpos = ((const char *)e) - starts;
5165 break;
5166 case 2:
5167 errmsg = "illegal encoding";
5168 startinpos = ((const char *)q) - 2 - starts;
5169 endinpos = startinpos + 2;
5170 break;
5171 case 3:
5172 errmsg = "illegal UTF-16 surrogate";
5173 startinpos = ((const char *)q) - 4 - starts;
5174 endinpos = startinpos + 2;
5175 break;
5176 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005177 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005178 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005179 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
5180 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 continue;
5182 }
5183
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005184 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005185 errors,
5186 &errorHandler,
5187 "utf16", errmsg,
5188 &starts,
5189 (const char **)&e,
5190 &startinpos,
5191 &endinpos,
5192 &exc,
5193 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005194 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 }
5197
Antoine Pitrou63065d72012-05-15 23:48:04 +02005198End:
Walter Dörwald69652032004-09-07 20:24:22 +00005199 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005201
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005202 Py_XDECREF(errorHandler);
5203 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005204 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005207 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005208 Py_XDECREF(errorHandler);
5209 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 return NULL;
5211}
5212
Tim Peters772747b2001-08-09 22:21:55 +00005213PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005214_PyUnicode_EncodeUTF16(PyObject *str,
5215 const char *errors,
5216 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005218 enum PyUnicode_Kind kind;
5219 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005220 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005221 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005222 unsigned short *out;
5223 Py_ssize_t bytesize;
5224 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005225#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005226 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005227#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005228 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005229#endif
5230
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005231 if (!PyUnicode_Check(str)) {
5232 PyErr_BadArgument();
5233 return NULL;
5234 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005235 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005236 return NULL;
5237 kind = PyUnicode_KIND(str);
5238 data = PyUnicode_DATA(str);
5239 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005240
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005241 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005242 if (kind == PyUnicode_4BYTE_KIND) {
5243 const Py_UCS4 *in = (const Py_UCS4 *)data;
5244 const Py_UCS4 *end = in + len;
5245 while (in < end)
5246 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005247 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005248 }
5249 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005251 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005252 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 if (v == NULL)
5254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005256 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005257 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005258 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005260 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005261 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005262 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005263
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005264 switch (kind) {
5265 case PyUnicode_1BYTE_KIND: {
5266 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5267 break;
Tim Peters772747b2001-08-09 22:21:55 +00005268 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005269 case PyUnicode_2BYTE_KIND: {
5270 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5271 break;
Tim Peters772747b2001-08-09 22:21:55 +00005272 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005273 case PyUnicode_4BYTE_KIND: {
5274 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5275 break;
5276 }
5277 default:
5278 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005279 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005280
5281 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005282 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283}
5284
Alexander Belopolsky40018472011-02-26 01:02:56 +00005285PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005286PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5287 Py_ssize_t size,
5288 const char *errors,
5289 int byteorder)
5290{
5291 PyObject *result;
5292 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5293 if (tmp == NULL)
5294 return NULL;
5295 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5296 Py_DECREF(tmp);
5297 return result;
5298}
5299
5300PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005301PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005303 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304}
5305
5306/* --- Unicode Escape Codec ----------------------------------------------- */
5307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005308/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5309 if all the escapes in the string make it still a valid ASCII string.
5310 Returns -1 if any escapes were found which cause the string to
5311 pop out of ASCII range. Otherwise returns the length of the
5312 required buffer to hold the string.
5313 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005314static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005315length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5316{
5317 const unsigned char *p = (const unsigned char *)s;
5318 const unsigned char *end = p + size;
5319 Py_ssize_t length = 0;
5320
5321 if (size < 0)
5322 return -1;
5323
5324 for (; p < end; ++p) {
5325 if (*p > 127) {
5326 /* Non-ASCII */
5327 return -1;
5328 }
5329 else if (*p != '\\') {
5330 /* Normal character */
5331 ++length;
5332 }
5333 else {
5334 /* Backslash-escape, check next char */
5335 ++p;
5336 /* Escape sequence reaches till end of string or
5337 non-ASCII follow-up. */
5338 if (p >= end || *p > 127)
5339 return -1;
5340 switch (*p) {
5341 case '\n':
5342 /* backslash + \n result in zero characters */
5343 break;
5344 case '\\': case '\'': case '\"':
5345 case 'b': case 'f': case 't':
5346 case 'n': case 'r': case 'v': case 'a':
5347 ++length;
5348 break;
5349 case '0': case '1': case '2': case '3':
5350 case '4': case '5': case '6': case '7':
5351 case 'x': case 'u': case 'U': case 'N':
5352 /* these do not guarantee ASCII characters */
5353 return -1;
5354 default:
5355 /* count the backslash + the other character */
5356 length += 2;
5357 }
5358 }
5359 }
5360 return length;
5361}
5362
Fredrik Lundh06d12682001-01-24 07:59:11 +00005363static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005364
Alexander Belopolsky40018472011-02-26 01:02:56 +00005365PyObject *
5366PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005367 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005368 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005371 Py_ssize_t startinpos;
5372 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005373 int j;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005374 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005376 char* message;
5377 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005378 PyObject *errorHandler = NULL;
5379 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005380 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005381
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005382 len = length_of_escaped_ascii_string(s, size);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005383 if (len == 0) {
5384 Py_INCREF(unicode_empty);
5385 return unicode_empty;
5386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005387
5388 /* After length_of_escaped_ascii_string() there are two alternatives,
5389 either the string is pure ASCII with named escapes like \n, etc.
5390 and we determined it's exact size (common case)
5391 or it contains \x, \u, ... escape sequences. then we create a
5392 legacy wchar string and resize it at the end of this function. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005393 _PyUnicodeWriter_Init(&writer, 0);
5394 if (len > 0) {
5395 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005396 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005397 assert(writer.kind == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005398 }
5399 else {
5400 /* Escaped strings will always be longer than the resulting
5401 Unicode string, so we start with size here and then reduce the
5402 length after conversion to the true value.
5403 (but if the error callback returns a long replacement string
5404 we'll have to allocate more space) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005405 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005406 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005407 }
5408
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005410 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005412
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 while (s < end) {
5414 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005415 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005416 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417
5418 /* Non-escape characters are interpreted as Unicode ordinals */
5419 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005420 x = (unsigned char)*s;
5421 s++;
5422 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005423 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005424 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5425 writer.pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 continue;
5427 }
5428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005429 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 /* \ - Escapes */
5431 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005432 c = *s++;
5433 if (s > end)
5434 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005435
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005436 /* The only case in which i == ascii_length is a backslash
5437 followed by a newline. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005438 assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005440 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441
Benjamin Peterson29060642009-01-31 22:14:21 +00005442 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005443#define WRITECHAR(ch) \
5444 do { \
5445 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) \
5446 goto onError; \
5447 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); \
5448 writer.pos++; \
5449 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005450
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005452 case '\\': WRITECHAR('\\'); break;
5453 case '\'': WRITECHAR('\''); break;
5454 case '\"': WRITECHAR('\"'); break;
5455 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005456 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005457 case 'f': WRITECHAR('\014'); break;
5458 case 't': WRITECHAR('\t'); break;
5459 case 'n': WRITECHAR('\n'); break;
5460 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005461 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005462 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005463 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005464 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465
Benjamin Peterson29060642009-01-31 22:14:21 +00005466 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 case '0': case '1': case '2': case '3':
5468 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005469 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005470 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005471 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005472 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005473 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005475 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 break;
5477
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 /* hex escapes */
5479 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005481 digits = 2;
5482 message = "truncated \\xXX escape";
5483 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005487 digits = 4;
5488 message = "truncated \\uXXXX escape";
5489 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005492 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005493 digits = 8;
5494 message = "truncated \\UXXXXXXXX escape";
5495 hexescape:
5496 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005497 if (s+digits>end) {
5498 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005499 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 errors, &errorHandler,
5501 "unicodeescape", "end of string in escape sequence",
5502 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005503 &writer))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005504 goto onError;
5505 goto nextByte;
5506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005507 for (j = 0; j < digits; ++j) {
5508 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005509 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005510 endinpos = (s+j+1)-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005511 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 errors, &errorHandler,
5513 "unicodeescape", message,
5514 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005515 &writer))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005516 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005517 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005518 }
5519 chr = (chr<<4) & ~0xF;
5520 if (c >= '0' && c <= '9')
5521 chr += c - '0';
5522 else if (c >= 'a' && c <= 'f')
5523 chr += 10 + c - 'a';
5524 else
5525 chr += 10 + c - 'A';
5526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005527 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005528 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005529 /* _decoding_error will have already written into the
5530 target buffer. */
5531 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005532 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005533 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005534 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005535 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005536 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005537 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005538 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 errors, &errorHandler,
5540 "unicodeescape", "illegal Unicode character",
5541 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005542 &writer))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005543 goto onError;
5544 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005545 break;
5546
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005548 case 'N':
5549 message = "malformed \\N character escape";
5550 if (ucnhash_CAPI == NULL) {
5551 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005552 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5553 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005554 if (ucnhash_CAPI == NULL)
5555 goto ucnhashError;
5556 }
5557 if (*s == '{') {
5558 const char *start = s+1;
5559 /* look for the closing brace */
5560 while (*s != '}' && s < end)
5561 s++;
5562 if (s > start && s < end && *s == '}') {
5563 /* found a name. look it up in the unicode database */
5564 message = "unknown Unicode character name";
5565 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005566 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005567 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005568 goto store;
5569 }
5570 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005571 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005572 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 errors, &errorHandler,
5574 "unicodeescape", message,
5575 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005576 &writer))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005577 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005578 break;
5579
5580 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005581 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005582 message = "\\ at end of string";
5583 s--;
5584 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005585 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 errors, &errorHandler,
5587 "unicodeescape", message,
5588 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005589 &writer))
Walter Dörwald8c077222002-03-25 11:16:18 +00005590 goto onError;
5591 }
5592 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005593 WRITECHAR('\\');
5594 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005595 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005596 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005599 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005601#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005602
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005603 Py_XDECREF(errorHandler);
5604 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005605 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005606
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005608 PyErr_SetString(
5609 PyExc_UnicodeError,
5610 "\\N escapes not supported (can't load unicodedata module)"
5611 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005612 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005613 Py_XDECREF(errorHandler);
5614 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005615 return NULL;
5616
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005618 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 Py_XDECREF(errorHandler);
5620 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 return NULL;
5622}
5623
5624/* Return a Unicode-Escape string version of the Unicode object.
5625
5626 If quotes is true, the string is enclosed in u"" or u'' quotes as
5627 appropriate.
5628
5629*/
5630
Alexander Belopolsky40018472011-02-26 01:02:56 +00005631PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005632PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005634 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005635 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005637 int kind;
5638 void *data;
5639 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640
Ezio Melottie7f90372012-10-05 03:33:31 +03005641 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005642 escape.
5643
Ezio Melottie7f90372012-10-05 03:33:31 +03005644 For UCS1 strings it's '\xxx', 4 bytes per source character.
5645 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5646 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005647 */
5648
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005649 if (!PyUnicode_Check(unicode)) {
5650 PyErr_BadArgument();
5651 return NULL;
5652 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005653 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005654 return NULL;
5655 len = PyUnicode_GET_LENGTH(unicode);
5656 kind = PyUnicode_KIND(unicode);
5657 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005658 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005659 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5660 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5661 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5662 }
5663
5664 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005665 return PyBytes_FromStringAndSize(NULL, 0);
5666
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005667 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005669
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005670 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005672 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005673 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 if (repr == NULL)
5675 return NULL;
5676
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005677 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005679 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005680 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005681
Walter Dörwald79e913e2007-05-12 11:08:06 +00005682 /* Escape backslashes */
5683 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 *p++ = '\\';
5685 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005686 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005687 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005688
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005689 /* Map 21-bit characters to '\U00xxxxxx' */
5690 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005691 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005692 *p++ = '\\';
5693 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005694 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5695 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5696 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5697 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5698 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5699 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5700 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5701 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005703 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005704
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005706 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 *p++ = '\\';
5708 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005709 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5710 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5711 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5712 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005714
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005715 /* Map special whitespace to '\t', \n', '\r' */
5716 else if (ch == '\t') {
5717 *p++ = '\\';
5718 *p++ = 't';
5719 }
5720 else if (ch == '\n') {
5721 *p++ = '\\';
5722 *p++ = 'n';
5723 }
5724 else if (ch == '\r') {
5725 *p++ = '\\';
5726 *p++ = 'r';
5727 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005728
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005729 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005730 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005732 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005733 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5734 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005735 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005736
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 /* Copy everything else as-is */
5738 else
5739 *p++ = (char) ch;
5740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005742 assert(p - PyBytes_AS_STRING(repr) > 0);
5743 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5744 return NULL;
5745 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746}
5747
Alexander Belopolsky40018472011-02-26 01:02:56 +00005748PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005749PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5750 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005752 PyObject *result;
5753 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5754 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005756 result = PyUnicode_AsUnicodeEscapeString(tmp);
5757 Py_DECREF(tmp);
5758 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759}
5760
5761/* --- Raw Unicode Escape Codec ------------------------------------------- */
5762
Alexander Belopolsky40018472011-02-26 01:02:56 +00005763PyObject *
5764PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005765 Py_ssize_t size,
5766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005769 Py_ssize_t startinpos;
5770 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005771 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 const char *end;
5773 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 PyObject *errorHandler = NULL;
5775 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005776
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005777 if (size == 0) {
5778 Py_INCREF(unicode_empty);
5779 return unicode_empty;
5780 }
5781
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 /* Escaped strings will always be longer than the resulting
5783 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 length after conversion to the true value. (But decoding error
5785 handler might have to resize the string) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005786 _PyUnicodeWriter_Init(&writer, 1);
5787 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005789
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 end = s + size;
5791 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 unsigned char c;
5793 Py_UCS4 x;
5794 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005795 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796
Benjamin Peterson29060642009-01-31 22:14:21 +00005797 /* Non-escape characters are interpreted as Unicode ordinals */
5798 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005799 x = (unsigned char)*s++;
5800 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005801 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005802 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5803 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005804 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005805 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 startinpos = s-starts;
5807
5808 /* \u-escapes are only interpreted iff the number of leading
5809 backslashes if odd */
5810 bs = s;
5811 for (;s < end;) {
5812 if (*s != '\\')
5813 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005814 x = (unsigned char)*s++;
5815 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005816 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005817 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5818 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 }
5820 if (((s - bs) & 1) == 0 ||
5821 s >= end ||
5822 (*s != 'u' && *s != 'U')) {
5823 continue;
5824 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005825 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 count = *s=='u' ? 4 : 8;
5827 s++;
5828
5829 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 for (x = 0, i = 0; i < count; ++i, ++s) {
5831 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005832 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005834 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 errors, &errorHandler,
5836 "rawunicodeescape", "truncated \\uXXXX",
5837 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005838 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 goto onError;
5840 goto nextByte;
5841 }
5842 x = (x<<4) & ~0xF;
5843 if (c >= '0' && c <= '9')
5844 x += c - '0';
5845 else if (c >= 'a' && c <= 'f')
5846 x += 10 + c - 'a';
5847 else
5848 x += 10 + c - 'A';
5849 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005850 if (x <= MAX_UNICODE) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005851 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005852 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005853 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5854 writer.pos++;
5855 }
5856 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005857 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005858 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005859 errors, &errorHandler,
5860 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005862 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005864 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 nextByte:
5866 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868 Py_XDECREF(errorHandler);
5869 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005870 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005871
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005873 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 Py_XDECREF(errorHandler);
5875 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 return NULL;
5877}
5878
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005879
Alexander Belopolsky40018472011-02-26 01:02:56 +00005880PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005881PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005883 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 char *p;
5885 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005886 Py_ssize_t expandsize, pos;
5887 int kind;
5888 void *data;
5889 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 if (!PyUnicode_Check(unicode)) {
5892 PyErr_BadArgument();
5893 return NULL;
5894 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005895 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005896 return NULL;
5897 kind = PyUnicode_KIND(unicode);
5898 data = PyUnicode_DATA(unicode);
5899 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005900 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5901 bytes, and 1 byte characters 4. */
5902 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005903
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005904 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005906
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005907 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 if (repr == NULL)
5909 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005910 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005911 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005913 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005914 for (pos = 0; pos < len; pos++) {
5915 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 /* Map 32-bit characters to '\Uxxxxxxxx' */
5917 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005918 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005919 *p++ = '\\';
5920 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005921 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5922 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5923 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5924 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5925 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5926 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5927 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5928 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005929 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005931 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 *p++ = '\\';
5933 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005934 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5935 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5936 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5937 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 /* Copy everything else as-is */
5940 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 *p++ = (char) ch;
5942 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005943
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005944 assert(p > q);
5945 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005946 return NULL;
5947 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948}
5949
Alexander Belopolsky40018472011-02-26 01:02:56 +00005950PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005951PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5952 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005954 PyObject *result;
5955 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5956 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005957 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005958 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5959 Py_DECREF(tmp);
5960 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961}
5962
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005963/* --- Unicode Internal Codec ------------------------------------------- */
5964
Alexander Belopolsky40018472011-02-26 01:02:56 +00005965PyObject *
5966_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005967 Py_ssize_t size,
5968 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005969{
5970 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005971 Py_ssize_t startinpos;
5972 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005973 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005974 const char *end;
5975 const char *reason;
5976 PyObject *errorHandler = NULL;
5977 PyObject *exc = NULL;
5978
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005979 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005980 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005981 1))
5982 return NULL;
5983
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005984 if (size == 0) {
5985 Py_INCREF(unicode_empty);
5986 return unicode_empty;
5987 }
5988
Thomas Wouters89f507f2006-12-13 04:49:30 +00005989 /* XXX overflow detection missing */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005990 _PyUnicodeWriter_Init(&writer, 0);
5991 if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005993 end = s + size;
5994
5995 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005996 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005997 Py_UCS4 ch;
5998 /* We copy the raw representation one byte at a time because the
5999 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006000 ((char *) &uch)[0] = s[0];
6001 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006002#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006003 ((char *) &uch)[2] = s[2];
6004 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006005#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006006 ch = uch;
6007
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006008 /* We have to sanity check the raw data, otherwise doom looms for
6009 some malformed UCS-4 data. */
6010 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006011#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006012 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006013#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006014 end-s < Py_UNICODE_SIZE
6015 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006017 startinpos = s - starts;
6018 if (end-s < Py_UNICODE_SIZE) {
6019 endinpos = end-starts;
6020 reason = "truncated input";
6021 }
6022 else {
6023 endinpos = s - starts + Py_UNICODE_SIZE;
6024 reason = "illegal code point (> 0x10FFFF)";
6025 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006026 if (unicode_decode_call_errorhandler_writer(
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006027 errors, &errorHandler,
6028 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006029 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006030 &writer))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006031 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006032 continue;
6033 }
6034
6035 s += Py_UNICODE_SIZE;
6036#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006037 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006038 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006039 Py_UNICODE uch2;
6040 ((char *) &uch2)[0] = s[0];
6041 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006042 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006043 {
Victor Stinner551ac952011-11-29 22:58:13 +01006044 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006045 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006046 }
6047 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006048#endif
6049
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006050 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006051 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006052 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
6053 writer.pos++;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006054 }
6055
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006056 Py_XDECREF(errorHandler);
6057 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006058 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006059
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006061 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006062 Py_XDECREF(errorHandler);
6063 Py_XDECREF(exc);
6064 return NULL;
6065}
6066
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067/* --- Latin-1 Codec ------------------------------------------------------ */
6068
Alexander Belopolsky40018472011-02-26 01:02:56 +00006069PyObject *
6070PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006071 Py_ssize_t size,
6072 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006075 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076}
6077
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006078/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006079static void
6080make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006081 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006082 PyObject *unicode,
6083 Py_ssize_t startpos, Py_ssize_t endpos,
6084 const char *reason)
6085{
6086 if (*exceptionObject == NULL) {
6087 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006088 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006089 encoding, unicode, startpos, endpos, reason);
6090 }
6091 else {
6092 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6093 goto onError;
6094 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6095 goto onError;
6096 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6097 goto onError;
6098 return;
6099 onError:
6100 Py_DECREF(*exceptionObject);
6101 *exceptionObject = NULL;
6102 }
6103}
6104
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006105/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006106static void
6107raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006108 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006109 PyObject *unicode,
6110 Py_ssize_t startpos, Py_ssize_t endpos,
6111 const char *reason)
6112{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006113 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006114 encoding, unicode, startpos, endpos, reason);
6115 if (*exceptionObject != NULL)
6116 PyCodec_StrictErrors(*exceptionObject);
6117}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006118
6119/* error handling callback helper:
6120 build arguments, call the callback and check the arguments,
6121 put the result into newpos and return the replacement string, which
6122 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006123static PyObject *
6124unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006125 PyObject **errorHandler,
6126 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006127 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006128 Py_ssize_t startpos, Py_ssize_t endpos,
6129 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006130{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006131 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006132 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133 PyObject *restuple;
6134 PyObject *resunicode;
6135
6136 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140 }
6141
Benjamin Petersonbac79492012-01-14 13:34:47 -05006142 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006143 return NULL;
6144 len = PyUnicode_GET_LENGTH(unicode);
6145
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006146 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006147 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006148 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006150
6151 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006153 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006155 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006156 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 Py_DECREF(restuple);
6158 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006159 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006160 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 &resunicode, newpos)) {
6162 Py_DECREF(restuple);
6163 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006165 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6166 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6167 Py_DECREF(restuple);
6168 return NULL;
6169 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006170 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006171 *newpos = len + *newpos;
6172 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6174 Py_DECREF(restuple);
6175 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006176 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006177 Py_INCREF(resunicode);
6178 Py_DECREF(restuple);
6179 return resunicode;
6180}
6181
Alexander Belopolsky40018472011-02-26 01:02:56 +00006182static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006183unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006184 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006185 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006186{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006187 /* input state */
6188 Py_ssize_t pos=0, size;
6189 int kind;
6190 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006191 /* output object */
6192 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006193 /* pointer into the output */
6194 char *str;
6195 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006196 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006197 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6198 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006199 PyObject *errorHandler = NULL;
6200 PyObject *exc = NULL;
6201 /* the following variable is used for caching string comparisons
6202 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6203 int known_errorHandler = -1;
6204
Benjamin Petersonbac79492012-01-14 13:34:47 -05006205 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006206 return NULL;
6207 size = PyUnicode_GET_LENGTH(unicode);
6208 kind = PyUnicode_KIND(unicode);
6209 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006210 /* allocate enough for a simple encoding without
6211 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006212 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006213 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006214 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006215 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006216 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006217 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006218 ressize = size;
6219
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006220 while (pos < size) {
6221 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006222
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 /* can we encode this? */
6224 if (c<limit) {
6225 /* no overflow check, because we know that the space is enough */
6226 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006227 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006228 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 Py_ssize_t requiredsize;
6231 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006232 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006234 Py_ssize_t collstart = pos;
6235 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006237 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 ++collend;
6239 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6240 if (known_errorHandler==-1) {
6241 if ((errors==NULL) || (!strcmp(errors, "strict")))
6242 known_errorHandler = 1;
6243 else if (!strcmp(errors, "replace"))
6244 known_errorHandler = 2;
6245 else if (!strcmp(errors, "ignore"))
6246 known_errorHandler = 3;
6247 else if (!strcmp(errors, "xmlcharrefreplace"))
6248 known_errorHandler = 4;
6249 else
6250 known_errorHandler = 0;
6251 }
6252 switch (known_errorHandler) {
6253 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006254 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 goto onError;
6256 case 2: /* replace */
6257 while (collstart++<collend)
6258 *str++ = '?'; /* fall through */
6259 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006260 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 break;
6262 case 4: /* xmlcharrefreplace */
6263 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006264 /* determine replacement size */
6265 for (i = collstart, repsize = 0; i < collend; ++i) {
6266 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6267 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006269 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006271 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006273 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006275 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006277 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006279 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006280 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006282 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006284 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 if (requiredsize > ressize) {
6286 if (requiredsize<2*ressize)
6287 requiredsize = 2*ressize;
6288 if (_PyBytes_Resize(&res, requiredsize))
6289 goto onError;
6290 str = PyBytes_AS_STRING(res) + respos;
6291 ressize = requiredsize;
6292 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006293 /* generate replacement */
6294 for (i = collstart; i < collend; ++i) {
6295 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006297 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 break;
6299 default:
6300 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006301 encoding, reason, unicode, &exc,
6302 collstart, collend, &newpos);
6303 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006304 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006306 if (PyBytes_Check(repunicode)) {
6307 /* Directly copy bytes result to output. */
6308 repsize = PyBytes_Size(repunicode);
6309 if (repsize > 1) {
6310 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006311 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006312 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6313 Py_DECREF(repunicode);
6314 goto onError;
6315 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006316 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006317 ressize += repsize-1;
6318 }
6319 memcpy(str, PyBytes_AsString(repunicode), repsize);
6320 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006321 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006322 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006323 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006324 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 /* need more space? (at least enough for what we
6326 have+the replacement+the rest of the string, so
6327 we won't have to check space for encodable characters) */
6328 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006329 repsize = PyUnicode_GET_LENGTH(repunicode);
6330 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 if (requiredsize > ressize) {
6332 if (requiredsize<2*ressize)
6333 requiredsize = 2*ressize;
6334 if (_PyBytes_Resize(&res, requiredsize)) {
6335 Py_DECREF(repunicode);
6336 goto onError;
6337 }
6338 str = PyBytes_AS_STRING(res) + respos;
6339 ressize = requiredsize;
6340 }
6341 /* check if there is anything unencodable in the replacement
6342 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006343 for (i = 0; repsize-->0; ++i, ++str) {
6344 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006346 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006347 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 Py_DECREF(repunicode);
6349 goto onError;
6350 }
6351 *str = (char)c;
6352 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006353 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006354 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006356 }
6357 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006358 /* Resize if we allocated to much */
6359 size = str - PyBytes_AS_STRING(res);
6360 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006361 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006362 if (_PyBytes_Resize(&res, size) < 0)
6363 goto onError;
6364 }
6365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006366 Py_XDECREF(errorHandler);
6367 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006368 return res;
6369
6370 onError:
6371 Py_XDECREF(res);
6372 Py_XDECREF(errorHandler);
6373 Py_XDECREF(exc);
6374 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375}
6376
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006377/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006378PyObject *
6379PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006380 Py_ssize_t size,
6381 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006383 PyObject *result;
6384 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6385 if (unicode == NULL)
6386 return NULL;
6387 result = unicode_encode_ucs1(unicode, errors, 256);
6388 Py_DECREF(unicode);
6389 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390}
6391
Alexander Belopolsky40018472011-02-26 01:02:56 +00006392PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006393_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394{
6395 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 PyErr_BadArgument();
6397 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006399 if (PyUnicode_READY(unicode) == -1)
6400 return NULL;
6401 /* Fast path: if it is a one-byte string, construct
6402 bytes object directly. */
6403 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6404 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6405 PyUnicode_GET_LENGTH(unicode));
6406 /* Non-Latin-1 characters present. Defer to above function to
6407 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006408 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006409}
6410
6411PyObject*
6412PyUnicode_AsLatin1String(PyObject *unicode)
6413{
6414 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415}
6416
6417/* --- 7-bit ASCII Codec -------------------------------------------------- */
6418
Alexander Belopolsky40018472011-02-26 01:02:56 +00006419PyObject *
6420PyUnicode_DecodeASCII(const char *s,
6421 Py_ssize_t size,
6422 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006425 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006426 int kind;
6427 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006428 Py_ssize_t startinpos;
6429 Py_ssize_t endinpos;
6430 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006431 const char *e;
6432 PyObject *errorHandler = NULL;
6433 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006434
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006435 if (size == 0) {
6436 Py_INCREF(unicode_empty);
6437 return unicode_empty;
6438 }
6439
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006441 if (size == 1 && (unsigned char)s[0] < 128)
6442 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006443
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006444 _PyUnicodeWriter_Init(&writer, 0);
6445 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006447
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006448 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006449 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006450 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006451 writer.pos = outpos;
6452 if (writer.pos == size)
6453 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006454
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006455 s += writer.pos;
6456 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 register unsigned char c = (unsigned char)*s;
6459 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006460 PyUnicode_WRITE(kind, data, writer.pos, c);
6461 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 ++s;
6463 }
6464 else {
6465 startinpos = s-starts;
6466 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006467 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 errors, &errorHandler,
6469 "ascii", "ordinal not in range(128)",
6470 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006471 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006473 kind = writer.kind;
6474 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006477 Py_XDECREF(errorHandler);
6478 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006479 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006480
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006482 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 Py_XDECREF(errorHandler);
6484 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 return NULL;
6486}
6487
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006488/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006489PyObject *
6490PyUnicode_EncodeASCII(const Py_UNICODE *p,
6491 Py_ssize_t size,
6492 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006494 PyObject *result;
6495 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6496 if (unicode == NULL)
6497 return NULL;
6498 result = unicode_encode_ucs1(unicode, errors, 128);
6499 Py_DECREF(unicode);
6500 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501}
6502
Alexander Belopolsky40018472011-02-26 01:02:56 +00006503PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006504_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505{
6506 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 PyErr_BadArgument();
6508 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006510 if (PyUnicode_READY(unicode) == -1)
6511 return NULL;
6512 /* Fast path: if it is an ASCII-only string, construct bytes object
6513 directly. Else defer to above function to raise the exception. */
6514 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6515 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6516 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006517 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006518}
6519
6520PyObject *
6521PyUnicode_AsASCIIString(PyObject *unicode)
6522{
6523 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524}
6525
Victor Stinner99b95382011-07-04 14:23:54 +02006526#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006527
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006528/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006529
/* NEED_RETRY is defined when SIZEOF_INT < SIZEOF_SIZE_T.
   decode_code_page_stateful() tests it to cut inputs longer than INT_MAX
   into several chunks. */
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006530#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006531#define NEED_RETRY
6532#endif
6533
/* Fallback definition of WC_ERR_INVALID_CHARS, used only when the macro
   has not been defined already. */
Victor Stinner3a50e702011-10-18 21:21:00 +02006534#ifndef WC_ERR_INVALID_CHARS
6535# define WC_ERR_INVALID_CHARS 0x0080
6536#endif
6537
6538static char*
6539code_page_name(UINT code_page, PyObject **obj)
6540{
6541 *obj = NULL;
6542 if (code_page == CP_ACP)
6543 return "mbcs";
6544 if (code_page == CP_UTF7)
6545 return "CP_UTF7";
6546 if (code_page == CP_UTF8)
6547 return "CP_UTF8";
6548
6549 *obj = PyBytes_FromFormat("cp%u", code_page);
6550 if (*obj == NULL)
6551 return NULL;
6552 return PyBytes_AS_STRING(*obj);
6553}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006554
Alexander Belopolsky40018472011-02-26 01:02:56 +00006555static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006556is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006557{
6558 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006559 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006560
Victor Stinner3a50e702011-10-18 21:21:00 +02006561 if (!IsDBCSLeadByteEx(code_page, *curr))
6562 return 0;
6563
6564 prev = CharPrevExA(code_page, s, curr, 0);
6565 if (prev == curr)
6566 return 1;
6567 /* FIXME: This code is limited to "true" double-byte encodings,
6568 as it assumes an incomplete character consists of a single
6569 byte. */
6570 if (curr - prev == 2)
6571 return 1;
6572 if (!IsDBCSLeadByteEx(code_page, *prev))
6573 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006574 return 0;
6575}
6576
Victor Stinner3a50e702011-10-18 21:21:00 +02006577static DWORD
6578decode_code_page_flags(UINT code_page)
6579{
6580 if (code_page == CP_UTF7) {
6581 /* The CP_UTF7 decoder only supports flags=0 */
6582 return 0;
6583 }
6584 else
6585 return MB_ERR_INVALID_CHARS;
6586}
6587
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006588/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006589 * Decode a byte string from a Windows code page into unicode object in strict
6590 * mode.
6591 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006592 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6593 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006594 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006595static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006596decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006597 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006598 const char *in,
6599 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006600{
Victor Stinner3a50e702011-10-18 21:21:00 +02006601 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006602 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006603 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006604
6605 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006606 assert(insize > 0);
6607 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6608 if (outsize <= 0)
6609 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006610
6611 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006612 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006613 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006614 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 if (*v == NULL)
6616 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006617 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006618 }
6619 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006621 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006622 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006624 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006625 }
6626
6627 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006628 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6629 if (outsize <= 0)
6630 goto error;
6631 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006632
Victor Stinner3a50e702011-10-18 21:21:00 +02006633error:
6634 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6635 return -2;
6636 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006637 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006638}
6639
Victor Stinner3a50e702011-10-18 21:21:00 +02006640/*
6641 * Decode a byte string from a code page into unicode object with an error
6642 * handler.
6643 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006644 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006645 * UnicodeDecodeError exception and returns -1 on error.
6646 */
6647static int
6648decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006649 PyObject **v,
6650 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006651 const char *errors)
6652{
6653 const char *startin = in;
6654 const char *endin = in + size;
6655 const DWORD flags = decode_code_page_flags(code_page);
6656 /* Ideally, we should get reason from FormatMessage. This is the Windows
6657 2000 English version of the message. */
6658 const char *reason = "No mapping for the Unicode character exists "
6659 "in the target code page.";
6660 /* each step cannot decode more than 1 character, but a character can be
6661 represented as a surrogate pair */
6662 wchar_t buffer[2], *startout, *out;
6663 int insize, outsize;
6664 PyObject *errorHandler = NULL;
6665 PyObject *exc = NULL;
6666 PyObject *encoding_obj = NULL;
6667 char *encoding;
6668 DWORD err;
6669 int ret = -1;
6670
6671 assert(size > 0);
6672
6673 encoding = code_page_name(code_page, &encoding_obj);
6674 if (encoding == NULL)
6675 return -1;
6676
6677 if (errors == NULL || strcmp(errors, "strict") == 0) {
6678 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6679 UnicodeDecodeError. */
6680 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6681 if (exc != NULL) {
6682 PyCodec_StrictErrors(exc);
6683 Py_CLEAR(exc);
6684 }
6685 goto error;
6686 }
6687
6688 if (*v == NULL) {
6689 /* Create unicode object */
6690 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6691 PyErr_NoMemory();
6692 goto error;
6693 }
Victor Stinnerab595942011-12-17 04:59:06 +01006694 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006695 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006696 if (*v == NULL)
6697 goto error;
6698 startout = PyUnicode_AS_UNICODE(*v);
6699 }
6700 else {
6701 /* Extend unicode object */
6702 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6703 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6704 PyErr_NoMemory();
6705 goto error;
6706 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006707 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006708 goto error;
6709 startout = PyUnicode_AS_UNICODE(*v) + n;
6710 }
6711
6712 /* Decode the byte string character per character */
6713 out = startout;
6714 while (in < endin)
6715 {
6716 /* Decode a character */
6717 insize = 1;
6718 do
6719 {
6720 outsize = MultiByteToWideChar(code_page, flags,
6721 in, insize,
6722 buffer, Py_ARRAY_LENGTH(buffer));
6723 if (outsize > 0)
6724 break;
6725 err = GetLastError();
6726 if (err != ERROR_NO_UNICODE_TRANSLATION
6727 && err != ERROR_INSUFFICIENT_BUFFER)
6728 {
6729 PyErr_SetFromWindowsErr(0);
6730 goto error;
6731 }
6732 insize++;
6733 }
6734 /* 4=maximum length of a UTF-8 sequence */
6735 while (insize <= 4 && (in + insize) <= endin);
6736
6737 if (outsize <= 0) {
6738 Py_ssize_t startinpos, endinpos, outpos;
6739
6740 startinpos = in - startin;
6741 endinpos = startinpos + 1;
6742 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006743 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006744 errors, &errorHandler,
6745 encoding, reason,
6746 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006747 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006748 {
6749 goto error;
6750 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006751 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006752 }
6753 else {
6754 in += insize;
6755 memcpy(out, buffer, outsize * sizeof(wchar_t));
6756 out += outsize;
6757 }
6758 }
6759
6760 /* write a NUL character at the end */
6761 *out = 0;
6762
6763 /* Extend unicode object */
6764 outsize = out - startout;
6765 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006766 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006767 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006768 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006769
6770error:
6771 Py_XDECREF(encoding_obj);
6772 Py_XDECREF(errorHandler);
6773 Py_XDECREF(exc);
6774 return ret;
6775}
6776
Victor Stinner3a50e702011-10-18 21:21:00 +02006777static PyObject *
6778decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006779 const char *s, Py_ssize_t size,
6780 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006781{
Victor Stinner76a31a62011-11-04 00:05:13 +01006782 PyObject *v = NULL;
6783 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006784
Victor Stinner3a50e702011-10-18 21:21:00 +02006785 if (code_page < 0) {
6786 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6787 return NULL;
6788 }
6789
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006790 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006792
Victor Stinner76a31a62011-11-04 00:05:13 +01006793 do
6794 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006795#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006796 if (size > INT_MAX) {
6797 chunk_size = INT_MAX;
6798 final = 0;
6799 done = 0;
6800 }
6801 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006802#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006803 {
6804 chunk_size = (int)size;
6805 final = (consumed == NULL);
6806 done = 1;
6807 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006808
Victor Stinner76a31a62011-11-04 00:05:13 +01006809 /* Skip trailing lead-byte unless 'final' is set */
6810 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6811 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006812
Victor Stinner76a31a62011-11-04 00:05:13 +01006813 if (chunk_size == 0 && done) {
6814 if (v != NULL)
6815 break;
6816 Py_INCREF(unicode_empty);
6817 return unicode_empty;
6818 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
Victor Stinner76a31a62011-11-04 00:05:13 +01006820
6821 converted = decode_code_page_strict(code_page, &v,
6822 s, chunk_size);
6823 if (converted == -2)
6824 converted = decode_code_page_errors(code_page, &v,
6825 s, chunk_size,
6826 errors);
6827 assert(converted != 0);
6828
6829 if (converted < 0) {
6830 Py_XDECREF(v);
6831 return NULL;
6832 }
6833
6834 if (consumed)
6835 *consumed += converted;
6836
6837 s += converted;
6838 size -= converted;
6839 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006840
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006841 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006842}
6843
Alexander Belopolsky40018472011-02-26 01:02:56 +00006844PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006845PyUnicode_DecodeCodePageStateful(int code_page,
6846 const char *s,
6847 Py_ssize_t size,
6848 const char *errors,
6849 Py_ssize_t *consumed)
6850{
6851 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6852}
6853
6854PyObject *
6855PyUnicode_DecodeMBCSStateful(const char *s,
6856 Py_ssize_t size,
6857 const char *errors,
6858 Py_ssize_t *consumed)
6859{
6860 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6861}
6862
6863PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006864PyUnicode_DecodeMBCS(const char *s,
6865 Py_ssize_t size,
6866 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006867{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6869}
6870
Victor Stinner3a50e702011-10-18 21:21:00 +02006871static DWORD
6872encode_code_page_flags(UINT code_page, const char *errors)
6873{
6874 if (code_page == CP_UTF8) {
6875 if (winver.dwMajorVersion >= 6)
6876 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6877 and later */
6878 return WC_ERR_INVALID_CHARS;
6879 else
6880 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6881 return 0;
6882 }
6883 else if (code_page == CP_UTF7) {
6884 /* CP_UTF7 only supports flags=0 */
6885 return 0;
6886 }
6887 else {
6888 if (errors != NULL && strcmp(errors, "replace") == 0)
6889 return 0;
6890 else
6891 return WC_NO_BEST_FIT_CHARS;
6892 }
6893}
6894
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006895/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006896 * Encode a Unicode string to a Windows code page into a byte string in strict
6897 * mode.
6898 *
6899 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006900 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006901 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006902static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006903encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006904 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006905 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006906{
Victor Stinner554f3f02010-06-16 23:33:54 +00006907 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 BOOL *pusedDefaultChar = &usedDefaultChar;
6909 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006910 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006911 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006912 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006913 const DWORD flags = encode_code_page_flags(code_page, NULL);
6914 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006915 /* Create a substring so that we can get the UTF-16 representation
6916 of just the slice under consideration. */
6917 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006918
Martin v. Löwis3d325192011-11-04 18:23:06 +01006919 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006920
Victor Stinner3a50e702011-10-18 21:21:00 +02006921 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006922 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006923 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006924 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006925
Victor Stinner2fc507f2011-11-04 20:06:39 +01006926 substring = PyUnicode_Substring(unicode, offset, offset+len);
6927 if (substring == NULL)
6928 return -1;
6929 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6930 if (p == NULL) {
6931 Py_DECREF(substring);
6932 return -1;
6933 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006934
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006935 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006936 outsize = WideCharToMultiByte(code_page, flags,
6937 p, size,
6938 NULL, 0,
6939 NULL, pusedDefaultChar);
6940 if (outsize <= 0)
6941 goto error;
6942 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006943 if (pusedDefaultChar && *pusedDefaultChar) {
6944 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006945 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006946 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006947
Victor Stinner3a50e702011-10-18 21:21:00 +02006948 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006950 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006951 if (*outbytes == NULL) {
6952 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006954 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006955 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006956 }
6957 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006958 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006959 const Py_ssize_t n = PyBytes_Size(*outbytes);
6960 if (outsize > PY_SSIZE_T_MAX - n) {
6961 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006962 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006963 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006964 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006965 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6966 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006967 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006968 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006969 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006970 }
6971
6972 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006973 outsize = WideCharToMultiByte(code_page, flags,
6974 p, size,
6975 out, outsize,
6976 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006977 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006978 if (outsize <= 0)
6979 goto error;
6980 if (pusedDefaultChar && *pusedDefaultChar)
6981 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006982 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006983
Victor Stinner3a50e702011-10-18 21:21:00 +02006984error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006985 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006986 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6987 return -2;
6988 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006989 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006990}
6991
Victor Stinner3a50e702011-10-18 21:21:00 +02006992/*
6993 * Encode a Unicode string to a Windows code page into a byte string using a
6994 * error handler.
6995 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006996 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02006997 * -1 on other error.
6998 */
6999static int
7000encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007001 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007002 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007003{
Victor Stinner3a50e702011-10-18 21:21:00 +02007004 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007005 Py_ssize_t pos = unicode_offset;
7006 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007007 /* Ideally, we should get reason from FormatMessage. This is the Windows
7008 2000 English version of the message. */
7009 const char *reason = "invalid character";
7010 /* 4=maximum length of a UTF-8 sequence */
7011 char buffer[4];
7012 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7013 Py_ssize_t outsize;
7014 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 PyObject *errorHandler = NULL;
7016 PyObject *exc = NULL;
7017 PyObject *encoding_obj = NULL;
7018 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007019 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007020 PyObject *rep;
7021 int ret = -1;
7022
7023 assert(insize > 0);
7024
7025 encoding = code_page_name(code_page, &encoding_obj);
7026 if (encoding == NULL)
7027 return -1;
7028
7029 if (errors == NULL || strcmp(errors, "strict") == 0) {
7030 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7031 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007032 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007033 if (exc != NULL) {
7034 PyCodec_StrictErrors(exc);
7035 Py_DECREF(exc);
7036 }
7037 Py_XDECREF(encoding_obj);
7038 return -1;
7039 }
7040
7041 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7042 pusedDefaultChar = &usedDefaultChar;
7043 else
7044 pusedDefaultChar = NULL;
7045
7046 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7047 PyErr_NoMemory();
7048 goto error;
7049 }
7050 outsize = insize * Py_ARRAY_LENGTH(buffer);
7051
7052 if (*outbytes == NULL) {
7053 /* Create string object */
7054 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7055 if (*outbytes == NULL)
7056 goto error;
7057 out = PyBytes_AS_STRING(*outbytes);
7058 }
7059 else {
7060 /* Extend string object */
7061 Py_ssize_t n = PyBytes_Size(*outbytes);
7062 if (n > PY_SSIZE_T_MAX - outsize) {
7063 PyErr_NoMemory();
7064 goto error;
7065 }
7066 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7067 goto error;
7068 out = PyBytes_AS_STRING(*outbytes) + n;
7069 }
7070
7071 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007072 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007073 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007074 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7075 wchar_t chars[2];
7076 int charsize;
7077 if (ch < 0x10000) {
7078 chars[0] = (wchar_t)ch;
7079 charsize = 1;
7080 }
7081 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007082 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7083 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007084 charsize = 2;
7085 }
7086
Victor Stinner3a50e702011-10-18 21:21:00 +02007087 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007088 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007089 buffer, Py_ARRAY_LENGTH(buffer),
7090 NULL, pusedDefaultChar);
7091 if (outsize > 0) {
7092 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7093 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007094 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007095 memcpy(out, buffer, outsize);
7096 out += outsize;
7097 continue;
7098 }
7099 }
7100 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7101 PyErr_SetFromWindowsErr(0);
7102 goto error;
7103 }
7104
Victor Stinner3a50e702011-10-18 21:21:00 +02007105 rep = unicode_encode_call_errorhandler(
7106 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007107 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007108 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007109 if (rep == NULL)
7110 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007111 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007112
7113 if (PyBytes_Check(rep)) {
7114 outsize = PyBytes_GET_SIZE(rep);
7115 if (outsize != 1) {
7116 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7117 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7118 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7119 Py_DECREF(rep);
7120 goto error;
7121 }
7122 out = PyBytes_AS_STRING(*outbytes) + offset;
7123 }
7124 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7125 out += outsize;
7126 }
7127 else {
7128 Py_ssize_t i;
7129 enum PyUnicode_Kind kind;
7130 void *data;
7131
Benjamin Petersonbac79492012-01-14 13:34:47 -05007132 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007133 Py_DECREF(rep);
7134 goto error;
7135 }
7136
7137 outsize = PyUnicode_GET_LENGTH(rep);
7138 if (outsize != 1) {
7139 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7140 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7141 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7142 Py_DECREF(rep);
7143 goto error;
7144 }
7145 out = PyBytes_AS_STRING(*outbytes) + offset;
7146 }
7147 kind = PyUnicode_KIND(rep);
7148 data = PyUnicode_DATA(rep);
7149 for (i=0; i < outsize; i++) {
7150 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7151 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007152 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007153 encoding, unicode,
7154 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 "unable to encode error handler result to ASCII");
7156 Py_DECREF(rep);
7157 goto error;
7158 }
7159 *out = (unsigned char)ch;
7160 out++;
7161 }
7162 }
7163 Py_DECREF(rep);
7164 }
7165 /* write a NUL byte */
7166 *out = 0;
7167 outsize = out - PyBytes_AS_STRING(*outbytes);
7168 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7169 if (_PyBytes_Resize(outbytes, outsize) < 0)
7170 goto error;
7171 ret = 0;
7172
7173error:
7174 Py_XDECREF(encoding_obj);
7175 Py_XDECREF(errorHandler);
7176 Py_XDECREF(exc);
7177 return ret;
7178}
7179
Victor Stinner3a50e702011-10-18 21:21:00 +02007180static PyObject *
7181encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007182 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 const char *errors)
7184{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007185 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007187 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007188 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007189
Benjamin Petersonbac79492012-01-14 13:34:47 -05007190 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007191 return NULL;
7192 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007193
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 if (code_page < 0) {
7195 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7196 return NULL;
7197 }
7198
Martin v. Löwis3d325192011-11-04 18:23:06 +01007199 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007200 return PyBytes_FromStringAndSize(NULL, 0);
7201
Victor Stinner7581cef2011-11-03 22:32:33 +01007202 offset = 0;
7203 do
7204 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007205#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007206 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007207 chunks. */
7208 if (len > INT_MAX/2) {
7209 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007210 done = 0;
7211 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007212 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007213#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007214 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007215 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007216 done = 1;
7217 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007218
Victor Stinner76a31a62011-11-04 00:05:13 +01007219 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007220 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007221 errors);
7222 if (ret == -2)
7223 ret = encode_code_page_errors(code_page, &outbytes,
7224 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007225 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007226 if (ret < 0) {
7227 Py_XDECREF(outbytes);
7228 return NULL;
7229 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007230
Victor Stinner7581cef2011-11-03 22:32:33 +01007231 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007232 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007233 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007234
Victor Stinner3a50e702011-10-18 21:21:00 +02007235 return outbytes;
7236}
7237
7238PyObject *
7239PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7240 Py_ssize_t size,
7241 const char *errors)
7242{
Victor Stinner7581cef2011-11-03 22:32:33 +01007243 PyObject *unicode, *res;
7244 unicode = PyUnicode_FromUnicode(p, size);
7245 if (unicode == NULL)
7246 return NULL;
7247 res = encode_code_page(CP_ACP, unicode, errors);
7248 Py_DECREF(unicode);
7249 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007250}
7251
7252PyObject *
7253PyUnicode_EncodeCodePage(int code_page,
7254 PyObject *unicode,
7255 const char *errors)
7256{
Victor Stinner7581cef2011-11-03 22:32:33 +01007257 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007258}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007259
Alexander Belopolsky40018472011-02-26 01:02:56 +00007260PyObject *
7261PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007262{
7263 if (!PyUnicode_Check(unicode)) {
7264 PyErr_BadArgument();
7265 return NULL;
7266 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007267 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007268}
7269
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007270#undef NEED_RETRY
7271
Victor Stinner99b95382011-07-04 14:23:54 +02007272#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007273
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274/* --- Character Mapping Codec -------------------------------------------- */
7275
Alexander Belopolsky40018472011-02-26 01:02:56 +00007276PyObject *
7277PyUnicode_DecodeCharmap(const char *s,
7278 Py_ssize_t size,
7279 PyObject *mapping,
7280 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007282 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007283 Py_ssize_t startinpos;
7284 Py_ssize_t endinpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007285 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007286 _PyUnicodeWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007287 PyObject *errorHandler = NULL;
7288 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007289
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290 /* Default to Latin-1 */
7291 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007292 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007294 if (size == 0) {
7295 Py_INCREF(unicode_empty);
7296 return unicode_empty;
7297 }
7298 _PyUnicodeWriter_Init(&writer, 0);
7299 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007301
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007302 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007303 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007304 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007305 enum PyUnicode_Kind mapkind;
7306 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007307 Py_UCS4 x;
7308
Benjamin Petersonbac79492012-01-14 13:34:47 -05007309 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007310 return NULL;
7311
7312 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007313 mapdata = PyUnicode_DATA(mapping);
7314 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007316 unsigned char ch;
7317 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007318 enum PyUnicode_Kind outkind = writer.kind;
7319 void *outdata = writer.data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007320 if (outkind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007321 Py_UCS4 maxchar = writer.maxchar;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007322 while (s < e) {
7323 unsigned char ch = *s;
7324 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7325 if (x > maxchar)
7326 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007327 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, writer.pos, x);
7328 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007329 ++s;
7330 }
7331 break;
7332 }
7333 else if (outkind == PyUnicode_2BYTE_KIND) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007334 while (s < e) {
7335 unsigned char ch = *s;
7336 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7337 if (x == 0xFFFE)
7338 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007339 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, writer.pos, x);
7340 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007341 ++s;
7342 }
7343 break;
7344 }
7345 }
7346 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007349 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007350 else
7351 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007352Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007353 if (x == 0xfffe)
7354 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 startinpos = s-starts;
7357 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007358 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 errors, &errorHandler,
7360 "charmap", "character maps to <undefined>",
7361 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007362 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 goto onError;
7364 }
7365 continue;
7366 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007367
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007368 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007369 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007370 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
7371 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007372 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007373 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007374 }
7375 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 while (s < e) {
7377 unsigned char ch = *s;
7378 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007379
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7381 w = PyLong_FromLong((long)ch);
7382 if (w == NULL)
7383 goto onError;
7384 x = PyObject_GetItem(mapping, w);
7385 Py_DECREF(w);
7386 if (x == NULL) {
7387 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7388 /* No mapping found means: mapping is undefined. */
7389 PyErr_Clear();
7390 x = Py_None;
7391 Py_INCREF(x);
7392 } else
7393 goto onError;
7394 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007395
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 /* Apply mapping */
7397 if (PyLong_Check(x)) {
7398 long value = PyLong_AS_LONG(x);
Antoine Pitroua1f76552012-09-23 20:00:04 +02007399 if (value < 0 || value > MAX_UNICODE) {
7400 PyErr_Format(PyExc_TypeError,
7401 "character mapping must be in range(0x%lx)",
7402 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 Py_DECREF(x);
7404 goto onError;
7405 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007406
7407 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007408 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007409 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7410 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 }
7412 else if (x == Py_None) {
7413 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 startinpos = s-starts;
7415 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007416 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 errors, &errorHandler,
7418 "charmap", "character maps to <undefined>",
7419 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007420 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 Py_DECREF(x);
7422 goto onError;
7423 }
7424 Py_DECREF(x);
7425 continue;
7426 }
7427 else if (PyUnicode_Check(x)) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007428 writer.overallocate = 1;
7429 if (_PyUnicodeWriter_WriteStr(&writer, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007430 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 }
7432 else {
7433 /* wrong return value */
7434 PyErr_SetString(PyExc_TypeError,
7435 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007436 Py_DECREF(x);
7437 goto onError;
7438 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 Py_DECREF(x);
7440 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007443 Py_XDECREF(errorHandler);
7444 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007445 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007446
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007448 Py_XDECREF(errorHandler);
7449 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007450 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 return NULL;
7452}
7453
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007454/* Charmap encoding: the lookup table */
7455
Alexander Belopolsky40018472011-02-26 01:02:56 +00007456struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 PyObject_HEAD
7458 unsigned char level1[32];
7459 int count2, count3;
7460 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007461};
7462
7463static PyObject*
7464encoding_map_size(PyObject *obj, PyObject* args)
7465{
7466 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007467 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007469}
7470
7471static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007472 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 PyDoc_STR("Return the size (in bytes) of this object") },
7474 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007475};
7476
7477static void
7478encoding_map_dealloc(PyObject* o)
7479{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007480 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007481}
7482
7483static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007484 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 "EncodingMap", /*tp_name*/
7486 sizeof(struct encoding_map), /*tp_basicsize*/
7487 0, /*tp_itemsize*/
7488 /* methods */
7489 encoding_map_dealloc, /*tp_dealloc*/
7490 0, /*tp_print*/
7491 0, /*tp_getattr*/
7492 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007493 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 0, /*tp_repr*/
7495 0, /*tp_as_number*/
7496 0, /*tp_as_sequence*/
7497 0, /*tp_as_mapping*/
7498 0, /*tp_hash*/
7499 0, /*tp_call*/
7500 0, /*tp_str*/
7501 0, /*tp_getattro*/
7502 0, /*tp_setattro*/
7503 0, /*tp_as_buffer*/
7504 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7505 0, /*tp_doc*/
7506 0, /*tp_traverse*/
7507 0, /*tp_clear*/
7508 0, /*tp_richcompare*/
7509 0, /*tp_weaklistoffset*/
7510 0, /*tp_iter*/
7511 0, /*tp_iternext*/
7512 encoding_map_methods, /*tp_methods*/
7513 0, /*tp_members*/
7514 0, /*tp_getset*/
7515 0, /*tp_base*/
7516 0, /*tp_dict*/
7517 0, /*tp_descr_get*/
7518 0, /*tp_descr_set*/
7519 0, /*tp_dictoffset*/
7520 0, /*tp_init*/
7521 0, /*tp_alloc*/
7522 0, /*tp_new*/
7523 0, /*tp_free*/
7524 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007525};
7526
7527PyObject*
7528PyUnicode_BuildEncodingMap(PyObject* string)
7529{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007530 PyObject *result;
7531 struct encoding_map *mresult;
7532 int i;
7533 int need_dict = 0;
7534 unsigned char level1[32];
7535 unsigned char level2[512];
7536 unsigned char *mlevel1, *mlevel2, *mlevel3;
7537 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007538 int kind;
7539 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007540 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007541 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007542
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007543 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007544 PyErr_BadArgument();
7545 return NULL;
7546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007547 kind = PyUnicode_KIND(string);
7548 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007549 length = PyUnicode_GET_LENGTH(string);
7550 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007551 memset(level1, 0xFF, sizeof level1);
7552 memset(level2, 0xFF, sizeof level2);
7553
7554 /* If there isn't a one-to-one mapping of NULL to \0,
7555 or if there are non-BMP characters, we need to use
7556 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007557 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007558 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007559 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007560 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007561 ch = PyUnicode_READ(kind, data, i);
7562 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007563 need_dict = 1;
7564 break;
7565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007566 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007567 /* unmapped character */
7568 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007569 l1 = ch >> 11;
7570 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007571 if (level1[l1] == 0xFF)
7572 level1[l1] = count2++;
7573 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007574 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007575 }
7576
7577 if (count2 >= 0xFF || count3 >= 0xFF)
7578 need_dict = 1;
7579
7580 if (need_dict) {
7581 PyObject *result = PyDict_New();
7582 PyObject *key, *value;
7583 if (!result)
7584 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007585 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007586 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007587 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007588 if (!key || !value)
7589 goto failed1;
7590 if (PyDict_SetItem(result, key, value) == -1)
7591 goto failed1;
7592 Py_DECREF(key);
7593 Py_DECREF(value);
7594 }
7595 return result;
7596 failed1:
7597 Py_XDECREF(key);
7598 Py_XDECREF(value);
7599 Py_DECREF(result);
7600 return NULL;
7601 }
7602
7603 /* Create a three-level trie */
7604 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7605 16*count2 + 128*count3 - 1);
7606 if (!result)
7607 return PyErr_NoMemory();
7608 PyObject_Init(result, &EncodingMapType);
7609 mresult = (struct encoding_map*)result;
7610 mresult->count2 = count2;
7611 mresult->count3 = count3;
7612 mlevel1 = mresult->level1;
7613 mlevel2 = mresult->level23;
7614 mlevel3 = mresult->level23 + 16*count2;
7615 memcpy(mlevel1, level1, 32);
7616 memset(mlevel2, 0xFF, 16*count2);
7617 memset(mlevel3, 0, 128*count3);
7618 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007619 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007620 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007621 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7622 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007623 /* unmapped character */
7624 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007625 o1 = ch>>11;
7626 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007627 i2 = 16*mlevel1[o1] + o2;
7628 if (mlevel2[i2] == 0xFF)
7629 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007630 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007631 i3 = 128*mlevel2[i2] + o3;
7632 mlevel3[i3] = i;
7633 }
7634 return result;
7635}
7636
7637static int
Victor Stinner22168992011-11-20 17:09:18 +01007638encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007639{
7640 struct encoding_map *map = (struct encoding_map*)mapping;
7641 int l1 = c>>11;
7642 int l2 = (c>>7) & 0xF;
7643 int l3 = c & 0x7F;
7644 int i;
7645
Victor Stinner22168992011-11-20 17:09:18 +01007646 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007648 if (c == 0)
7649 return 0;
7650 /* level 1*/
7651 i = map->level1[l1];
7652 if (i == 0xFF) {
7653 return -1;
7654 }
7655 /* level 2*/
7656 i = map->level23[16*i+l2];
7657 if (i == 0xFF) {
7658 return -1;
7659 }
7660 /* level 3 */
7661 i = map->level23[16*map->count2 + 128*i + l3];
7662 if (i == 0) {
7663 return -1;
7664 }
7665 return i;
7666}
7667
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007668/* Lookup the character ch in the mapping. If the character
7669 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007670 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007671static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007672charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673{
Christian Heimes217cfd12007-12-02 14:31:20 +00007674 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007675 PyObject *x;
7676
7677 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007679 x = PyObject_GetItem(mapping, w);
7680 Py_DECREF(w);
7681 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7683 /* No mapping found means: mapping is undefined. */
7684 PyErr_Clear();
7685 x = Py_None;
7686 Py_INCREF(x);
7687 return x;
7688 } else
7689 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007691 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007693 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 long value = PyLong_AS_LONG(x);
7695 if (value < 0 || value > 255) {
7696 PyErr_SetString(PyExc_TypeError,
7697 "character mapping must be in range(256)");
7698 Py_DECREF(x);
7699 return NULL;
7700 }
7701 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007703 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 /* wrong return value */
7707 PyErr_Format(PyExc_TypeError,
7708 "character mapping must return integer, bytes or None, not %.400s",
7709 x->ob_type->tp_name);
7710 Py_DECREF(x);
7711 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712 }
7713}
7714
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007715static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007716charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007717{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007718 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7719 /* exponentially overallocate to minimize reallocations */
7720 if (requiredsize < 2*outsize)
7721 requiredsize = 2*outsize;
7722 if (_PyBytes_Resize(outobj, requiredsize))
7723 return -1;
7724 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007725}
7726
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred and an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007730/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007731 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007732 space is available. Return a new reference to the object that
7733 was put in the output buffer, or Py_None, if the mapping was undefined
7734 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007735 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007736static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007737charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007738 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007739{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007740 PyObject *rep;
7741 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007742 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007743
Christian Heimes90aa7642007-12-19 02:45:37 +00007744 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007745 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007747 if (res == -1)
7748 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 if (outsize<requiredsize)
7750 if (charmapencode_resize(outobj, outpos, requiredsize))
7751 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007752 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 outstart[(*outpos)++] = (char)res;
7754 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007755 }
7756
7757 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007758 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007760 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 Py_DECREF(rep);
7762 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007763 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 if (PyLong_Check(rep)) {
7765 Py_ssize_t requiredsize = *outpos+1;
7766 if (outsize<requiredsize)
7767 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7768 Py_DECREF(rep);
7769 return enc_EXCEPTION;
7770 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007771 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007773 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 else {
7775 const char *repchars = PyBytes_AS_STRING(rep);
7776 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7777 Py_ssize_t requiredsize = *outpos+repsize;
7778 if (outsize<requiredsize)
7779 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7780 Py_DECREF(rep);
7781 return enc_EXCEPTION;
7782 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007783 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 memcpy(outstart + *outpos, repchars, repsize);
7785 *outpos += repsize;
7786 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007787 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007788 Py_DECREF(rep);
7789 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007790}
7791
7792/* handle an error in PyUnicode_EncodeCharmap
7793 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007794static int
7795charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007796 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007797 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007798 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007799 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007800{
7801 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007802 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007803 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007804 enum PyUnicode_Kind kind;
7805 void *data;
7806 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007807 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007808 Py_ssize_t collstartpos = *inpos;
7809 Py_ssize_t collendpos = *inpos+1;
7810 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007811 char *encoding = "charmap";
7812 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007813 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007814 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007815 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007816
Benjamin Petersonbac79492012-01-14 13:34:47 -05007817 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007818 return -1;
7819 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007820 /* find all unencodable characters */
7821 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007822 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007823 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007824 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007825 val = encoding_map_lookup(ch, mapping);
7826 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 break;
7828 ++collendpos;
7829 continue;
7830 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007831
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007832 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7833 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 if (rep==NULL)
7835 return -1;
7836 else if (rep!=Py_None) {
7837 Py_DECREF(rep);
7838 break;
7839 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007840 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007842 }
7843 /* cache callback name lookup
7844 * (if not done yet, i.e. it's the first error) */
7845 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 if ((errors==NULL) || (!strcmp(errors, "strict")))
7847 *known_errorHandler = 1;
7848 else if (!strcmp(errors, "replace"))
7849 *known_errorHandler = 2;
7850 else if (!strcmp(errors, "ignore"))
7851 *known_errorHandler = 3;
7852 else if (!strcmp(errors, "xmlcharrefreplace"))
7853 *known_errorHandler = 4;
7854 else
7855 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007856 }
7857 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007858 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007859 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007860 return -1;
7861 case 2: /* replace */
7862 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 x = charmapencode_output('?', mapping, res, respos);
7864 if (x==enc_EXCEPTION) {
7865 return -1;
7866 }
7867 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007868 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 return -1;
7870 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007871 }
7872 /* fall through */
7873 case 3: /* ignore */
7874 *inpos = collendpos;
7875 break;
7876 case 4: /* xmlcharrefreplace */
7877 /* generate replacement (temporarily (mis)uses p) */
7878 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007879 char buffer[2+29+1+1];
7880 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007881 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 for (cp = buffer; *cp; ++cp) {
7883 x = charmapencode_output(*cp, mapping, res, respos);
7884 if (x==enc_EXCEPTION)
7885 return -1;
7886 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007887 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 return -1;
7889 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007890 }
7891 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007892 *inpos = collendpos;
7893 break;
7894 default:
7895 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007896 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007898 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007900 if (PyBytes_Check(repunicode)) {
7901 /* Directly copy bytes result to output. */
7902 Py_ssize_t outsize = PyBytes_Size(*res);
7903 Py_ssize_t requiredsize;
7904 repsize = PyBytes_Size(repunicode);
7905 requiredsize = *respos + repsize;
7906 if (requiredsize > outsize)
7907 /* Make room for all additional bytes. */
7908 if (charmapencode_resize(res, respos, requiredsize)) {
7909 Py_DECREF(repunicode);
7910 return -1;
7911 }
7912 memcpy(PyBytes_AsString(*res) + *respos,
7913 PyBytes_AsString(repunicode), repsize);
7914 *respos += repsize;
7915 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007916 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007917 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007918 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007919 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007920 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007921 Py_DECREF(repunicode);
7922 return -1;
7923 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007924 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007925 data = PyUnicode_DATA(repunicode);
7926 kind = PyUnicode_KIND(repunicode);
7927 for (index = 0; index < repsize; index++) {
7928 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7929 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007931 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 return -1;
7933 }
7934 else if (x==enc_FAILED) {
7935 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007936 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 return -1;
7938 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007939 }
7940 *inpos = newpos;
7941 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007942 }
7943 return 0;
7944}
7945
Alexander Belopolsky40018472011-02-26 01:02:56 +00007946PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007947_PyUnicode_EncodeCharmap(PyObject *unicode,
7948 PyObject *mapping,
7949 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007951 /* output object */
7952 PyObject *res = NULL;
7953 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007954 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007955 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007956 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007957 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007958 PyObject *errorHandler = NULL;
7959 PyObject *exc = NULL;
7960 /* the following variable is used for caching string comparisons
7961 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7962 * 3=ignore, 4=xmlcharrefreplace */
7963 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964
Benjamin Petersonbac79492012-01-14 13:34:47 -05007965 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007966 return NULL;
7967 size = PyUnicode_GET_LENGTH(unicode);
7968
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 /* Default to Latin-1 */
7970 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007971 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007973 /* allocate enough for a simple encoding without
7974 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007975 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976 if (res == NULL)
7977 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007978 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007981 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007982 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007984 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 if (x==enc_EXCEPTION) /* error */
7986 goto onError;
7987 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007988 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 &exc,
7990 &known_errorHandler, &errorHandler, errors,
7991 &res, &respos)) {
7992 goto onError;
7993 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007994 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 else
7996 /* done with this character => adjust input position */
7997 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008000 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008001 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008002 if (_PyBytes_Resize(&res, respos) < 0)
8003 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008005 Py_XDECREF(exc);
8006 Py_XDECREF(errorHandler);
8007 return res;
8008
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008010 Py_XDECREF(res);
8011 Py_XDECREF(exc);
8012 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 return NULL;
8014}
8015
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008016/* Deprecated */
8017PyObject *
8018PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8019 Py_ssize_t size,
8020 PyObject *mapping,
8021 const char *errors)
8022{
8023 PyObject *result;
8024 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8025 if (unicode == NULL)
8026 return NULL;
8027 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8028 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008029 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008030}
8031
Alexander Belopolsky40018472011-02-26 01:02:56 +00008032PyObject *
8033PyUnicode_AsCharmapString(PyObject *unicode,
8034 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035{
8036 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 PyErr_BadArgument();
8038 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008040 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041}
8042
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008043/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008044static void
8045make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008046 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008047 Py_ssize_t startpos, Py_ssize_t endpos,
8048 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008050 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008051 *exceptionObject = _PyUnicodeTranslateError_Create(
8052 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053 }
8054 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8056 goto onError;
8057 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8058 goto onError;
8059 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8060 goto onError;
8061 return;
8062 onError:
8063 Py_DECREF(*exceptionObject);
8064 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 }
8066}
8067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008068/* error handling callback helper:
8069 build arguments, call the callback and check the arguments,
8070 put the result into newpos and return the replacement string, which
8071 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008072static PyObject *
8073unicode_translate_call_errorhandler(const char *errors,
8074 PyObject **errorHandler,
8075 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008076 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008077 Py_ssize_t startpos, Py_ssize_t endpos,
8078 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008079{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008080 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008081
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008082 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008083 PyObject *restuple;
8084 PyObject *resunicode;
8085
8086 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008090 }
8091
8092 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008093 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008094 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008096
8097 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008101 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008102 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 Py_DECREF(restuple);
8104 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008105 }
8106 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 &resunicode, &i_newpos)) {
8108 Py_DECREF(restuple);
8109 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008111 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008113 else
8114 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008115 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008116 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8117 Py_DECREF(restuple);
8118 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008119 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008120 Py_INCREF(resunicode);
8121 Py_DECREF(restuple);
8122 return resunicode;
8123}
8124
8125/* Lookup the character ch in the mapping and put the result in result,
8126 which must be decrefed by the caller.
8127 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008128static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008129charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008130{
Christian Heimes217cfd12007-12-02 14:31:20 +00008131 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008132 PyObject *x;
8133
8134 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008136 x = PyObject_GetItem(mapping, w);
8137 Py_DECREF(w);
8138 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8140 /* No mapping found means: use 1:1 mapping. */
8141 PyErr_Clear();
8142 *result = NULL;
8143 return 0;
8144 } else
8145 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 }
8147 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 *result = x;
8149 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008150 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008151 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 long value = PyLong_AS_LONG(x);
8153 long max = PyUnicode_GetMax();
8154 if (value < 0 || value > max) {
8155 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008156 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 Py_DECREF(x);
8158 return -1;
8159 }
8160 *result = x;
8161 return 0;
8162 }
8163 else if (PyUnicode_Check(x)) {
8164 *result = x;
8165 return 0;
8166 }
8167 else {
8168 /* wrong return value */
8169 PyErr_SetString(PyExc_TypeError,
8170 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008171 Py_DECREF(x);
8172 return -1;
8173 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008174}
8175/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 if not reallocate and adjust various state variables.
8177 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008178static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008179charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008181{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008182 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008183 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008184 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 /* exponentially overallocate to minimize reallocations */
8186 if (requiredsize < 2 * oldsize)
8187 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008188 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8189 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008191 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008193 }
8194 return 0;
8195}
8196/* lookup the character, put the result in the output string and adjust
8197 various state variables. Return a new reference to the object that
8198 was put in the output buffer in *result, or Py_None, if the mapping was
8199 undefined (in which case no character was written).
8200 The called must decref result.
8201 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008202static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008203charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8204 PyObject *mapping, Py_UCS4 **output,
8205 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008206 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008207{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008208 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8209 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008211 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008213 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008214 }
8215 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008217 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008219 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008220 }
8221 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008222 Py_ssize_t repsize;
8223 if (PyUnicode_READY(*res) == -1)
8224 return -1;
8225 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 if (repsize==1) {
8227 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008228 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008229 }
8230 else if (repsize!=0) {
8231 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008232 Py_ssize_t requiredsize = *opos +
8233 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 Py_ssize_t i;
8236 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238 for(i = 0; i < repsize; i++)
8239 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241 }
8242 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008244 return 0;
8245}
8246
Alexander Belopolsky40018472011-02-26 01:02:56 +00008247PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248_PyUnicode_TranslateCharmap(PyObject *input,
8249 PyObject *mapping,
8250 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 /* input object */
8253 char *idata;
8254 Py_ssize_t size, i;
8255 int kind;
8256 /* output buffer */
8257 Py_UCS4 *output = NULL;
8258 Py_ssize_t osize;
8259 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008261 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 char *reason = "character maps to <undefined>";
8263 PyObject *errorHandler = NULL;
8264 PyObject *exc = NULL;
8265 /* the following variable is used for caching string comparisons
8266 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8267 * 3=ignore, 4=xmlcharrefreplace */
8268 int known_errorHandler = -1;
8269
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 PyErr_BadArgument();
8272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008275 if (PyUnicode_READY(input) == -1)
8276 return NULL;
8277 idata = (char*)PyUnicode_DATA(input);
8278 kind = PyUnicode_KIND(input);
8279 size = PyUnicode_GET_LENGTH(input);
8280 i = 0;
8281
8282 if (size == 0) {
8283 Py_INCREF(input);
8284 return input;
8285 }
8286
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008287 /* allocate enough for a simple 1:1 translation without
8288 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008289 osize = size;
8290 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8291 opos = 0;
8292 if (output == NULL) {
8293 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008297 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 /* try to encode it */
8299 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008300 if (charmaptranslate_output(input, i, mapping,
8301 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 Py_XDECREF(x);
8303 goto onError;
8304 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008305 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008307 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 else { /* untranslatable character */
8309 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8310 Py_ssize_t repsize;
8311 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008314 Py_ssize_t collstart = i;
8315 Py_ssize_t collend = i+1;
8316 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008319 while (collend < size) {
8320 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 goto onError;
8322 Py_XDECREF(x);
8323 if (x!=Py_None)
8324 break;
8325 ++collend;
8326 }
8327 /* cache callback name lookup
8328 * (if not done yet, i.e. it's the first error) */
8329 if (known_errorHandler==-1) {
8330 if ((errors==NULL) || (!strcmp(errors, "strict")))
8331 known_errorHandler = 1;
8332 else if (!strcmp(errors, "replace"))
8333 known_errorHandler = 2;
8334 else if (!strcmp(errors, "ignore"))
8335 known_errorHandler = 3;
8336 else if (!strcmp(errors, "xmlcharrefreplace"))
8337 known_errorHandler = 4;
8338 else
8339 known_errorHandler = 0;
8340 }
8341 switch (known_errorHandler) {
8342 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008343 make_translate_exception(&exc,
8344 input, collstart, collend, reason);
8345 if (exc != NULL)
8346 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008347 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 case 2: /* replace */
8349 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008350 for (coll = collstart; coll<collend; coll++)
8351 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 /* fall through */
8353 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008354 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 break;
8356 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008357 /* generate replacement (temporarily (mis)uses i) */
8358 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 char buffer[2+29+1+1];
8360 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8362 if (charmaptranslate_makespace(&output, &osize,
8363 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 goto onError;
8365 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 break;
8370 default:
8371 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 reason, input, &exc,
8373 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008374 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008376 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008377 Py_DECREF(repunicode);
8378 goto onError;
8379 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 repsize = PyUnicode_GET_LENGTH(repunicode);
8382 if (charmaptranslate_makespace(&output, &osize,
8383 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 Py_DECREF(repunicode);
8385 goto onError;
8386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 for (uni2 = 0; repsize-->0; ++uni2)
8388 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8389 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008391 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008392 }
8393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8395 if (!res)
8396 goto onError;
8397 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008398 Py_XDECREF(exc);
8399 Py_XDECREF(errorHandler);
8400 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008403 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 Py_XDECREF(exc);
8405 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406 return NULL;
8407}
8408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008409/* Deprecated. Use PyUnicode_Translate instead. */
8410PyObject *
8411PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8412 Py_ssize_t size,
8413 PyObject *mapping,
8414 const char *errors)
8415{
Christian Heimes5f520f42012-09-11 14:03:25 +02008416 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8418 if (!unicode)
8419 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008420 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8421 Py_DECREF(unicode);
8422 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008423}
8424
Alexander Belopolsky40018472011-02-26 01:02:56 +00008425PyObject *
8426PyUnicode_Translate(PyObject *str,
8427 PyObject *mapping,
8428 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429{
8430 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008431
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 str = PyUnicode_FromObject(str);
8433 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008434 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436 Py_DECREF(str);
8437 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438}
Tim Petersced69f82003-09-16 20:30:58 +00008439
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008441fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008442{
8443 /* No need to call PyUnicode_READY(self) because this function is only
8444 called as a callback from fixup() which does it already. */
8445 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8446 const int kind = PyUnicode_KIND(self);
8447 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008448 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008449 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450 Py_ssize_t i;
8451
8452 for (i = 0; i < len; ++i) {
8453 ch = PyUnicode_READ(kind, data, i);
8454 fixed = 0;
8455 if (ch > 127) {
8456 if (Py_UNICODE_ISSPACE(ch))
8457 fixed = ' ';
8458 else {
8459 const int decimal = Py_UNICODE_TODECIMAL(ch);
8460 if (decimal >= 0)
8461 fixed = '0' + decimal;
8462 }
8463 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008464 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008465 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 PyUnicode_WRITE(kind, data, i, fixed);
8467 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008468 else
8469 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 }
8472
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008473 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474}
8475
8476PyObject *
8477_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8478{
8479 if (!PyUnicode_Check(unicode)) {
8480 PyErr_BadInternalCall();
8481 return NULL;
8482 }
8483 if (PyUnicode_READY(unicode) == -1)
8484 return NULL;
8485 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8486 /* If the string is already ASCII, just return the same string */
8487 Py_INCREF(unicode);
8488 return unicode;
8489 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008490 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491}
8492
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008493PyObject *
8494PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8495 Py_ssize_t length)
8496{
Victor Stinnerf0124502011-11-21 23:12:56 +01008497 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008498 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008499 Py_UCS4 maxchar;
8500 enum PyUnicode_Kind kind;
8501 void *data;
8502
Victor Stinner99d7ad02012-02-22 13:37:39 +01008503 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008504 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008505 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008506 if (ch > 127) {
8507 int decimal = Py_UNICODE_TODECIMAL(ch);
8508 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008509 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008510 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008511 }
8512 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008513
8514 /* Copy to a new string */
8515 decimal = PyUnicode_New(length, maxchar);
8516 if (decimal == NULL)
8517 return decimal;
8518 kind = PyUnicode_KIND(decimal);
8519 data = PyUnicode_DATA(decimal);
8520 /* Iterate over code points */
8521 for (i = 0; i < length; i++) {
8522 Py_UNICODE ch = s[i];
8523 if (ch > 127) {
8524 int decimal = Py_UNICODE_TODECIMAL(ch);
8525 if (decimal >= 0)
8526 ch = '0' + decimal;
8527 }
8528 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008530 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008531}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008532/* --- Decimal Encoder ---------------------------------------------------- */
8533
Alexander Belopolsky40018472011-02-26 01:02:56 +00008534int
8535PyUnicode_EncodeDecimal(Py_UNICODE *s,
8536 Py_ssize_t length,
8537 char *output,
8538 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008539{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008540 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008541 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008542 enum PyUnicode_Kind kind;
8543 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008544
8545 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 PyErr_BadArgument();
8547 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008548 }
8549
Victor Stinner42bf7752011-11-21 22:52:58 +01008550 unicode = PyUnicode_FromUnicode(s, length);
8551 if (unicode == NULL)
8552 return -1;
8553
Benjamin Petersonbac79492012-01-14 13:34:47 -05008554 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008555 Py_DECREF(unicode);
8556 return -1;
8557 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008558 kind = PyUnicode_KIND(unicode);
8559 data = PyUnicode_DATA(unicode);
8560
Victor Stinnerb84d7232011-11-22 01:50:07 +01008561 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008562 PyObject *exc;
8563 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008565 Py_ssize_t startpos;
8566
8567 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008568
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008570 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008571 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008573 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 decimal = Py_UNICODE_TODECIMAL(ch);
8575 if (decimal >= 0) {
8576 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008577 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 continue;
8579 }
8580 if (0 < ch && ch < 256) {
8581 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008582 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 continue;
8584 }
Victor Stinner6345be92011-11-25 20:09:01 +01008585
Victor Stinner42bf7752011-11-21 22:52:58 +01008586 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008587 exc = NULL;
8588 raise_encode_exception(&exc, "decimal", unicode,
8589 startpos, startpos+1,
8590 "invalid decimal Unicode string");
8591 Py_XDECREF(exc);
8592 Py_DECREF(unicode);
8593 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008594 }
8595 /* 0-terminate the output string */
8596 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008597 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008598 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008599}
8600
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601/* --- Helpers ------------------------------------------------------------ */
8602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008604any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008605 Py_ssize_t start,
8606 Py_ssize_t end)
8607{
8608 int kind1, kind2, kind;
8609 void *buf1, *buf2;
8610 Py_ssize_t len1, len2, result;
8611
8612 kind1 = PyUnicode_KIND(s1);
8613 kind2 = PyUnicode_KIND(s2);
8614 kind = kind1 > kind2 ? kind1 : kind2;
8615 buf1 = PyUnicode_DATA(s1);
8616 buf2 = PyUnicode_DATA(s2);
8617 if (kind1 != kind)
8618 buf1 = _PyUnicode_AsKind(s1, kind);
8619 if (!buf1)
8620 return -2;
8621 if (kind2 != kind)
8622 buf2 = _PyUnicode_AsKind(s2, kind);
8623 if (!buf2) {
8624 if (kind1 != kind) PyMem_Free(buf1);
8625 return -2;
8626 }
8627 len1 = PyUnicode_GET_LENGTH(s1);
8628 len2 = PyUnicode_GET_LENGTH(s2);
8629
Victor Stinner794d5672011-10-10 03:21:36 +02008630 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008631 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008632 case PyUnicode_1BYTE_KIND:
8633 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8634 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8635 else
8636 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8637 break;
8638 case PyUnicode_2BYTE_KIND:
8639 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8640 break;
8641 case PyUnicode_4BYTE_KIND:
8642 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8643 break;
8644 default:
8645 assert(0); result = -2;
8646 }
8647 }
8648 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008649 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008650 case PyUnicode_1BYTE_KIND:
8651 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8652 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8653 else
8654 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8655 break;
8656 case PyUnicode_2BYTE_KIND:
8657 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8658 break;
8659 case PyUnicode_4BYTE_KIND:
8660 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8661 break;
8662 default:
8663 assert(0); result = -2;
8664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 }
8666
8667 if (kind1 != kind)
8668 PyMem_Free(buf1);
8669 if (kind2 != kind)
8670 PyMem_Free(buf2);
8671
8672 return result;
8673}
8674
8675Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008676_PyUnicode_InsertThousandsGrouping(
8677 PyObject *unicode, Py_ssize_t index,
8678 Py_ssize_t n_buffer,
8679 void *digits, Py_ssize_t n_digits,
8680 Py_ssize_t min_width,
8681 const char *grouping, PyObject *thousands_sep,
8682 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008683{
Victor Stinner41a863c2012-02-24 00:37:51 +01008684 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008685 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008686 Py_ssize_t thousands_sep_len;
8687 Py_ssize_t len;
8688
8689 if (unicode != NULL) {
8690 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008691 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008692 }
8693 else {
8694 kind = PyUnicode_1BYTE_KIND;
8695 data = NULL;
8696 }
8697 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8698 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8699 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8700 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008701 if (thousands_sep_kind < kind) {
8702 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8703 if (!thousands_sep_data)
8704 return -1;
8705 }
8706 else {
8707 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8708 if (!data)
8709 return -1;
8710 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008711 }
8712
Benjamin Petersonead6b532011-12-20 17:23:42 -06008713 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008715 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008716 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008717 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008718 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008719 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008720 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008721 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008722 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008723 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008724 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008725 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008727 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008728 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008729 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008730 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008731 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008732 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008733 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008734 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008735 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008736 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008737 break;
8738 default:
8739 assert(0);
8740 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008742 if (unicode != NULL && thousands_sep_kind != kind) {
8743 if (thousands_sep_kind < kind)
8744 PyMem_Free(thousands_sep_data);
8745 else
8746 PyMem_Free(data);
8747 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008748 if (unicode == NULL) {
8749 *maxchar = 127;
8750 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008751 *maxchar = MAX_MAXCHAR(*maxchar,
8752 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008753 }
8754 }
8755 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756}
8757
8758
/* helper macro to fixup start/end slice values

   end is clamped to len when it is larger; a negative end has len added to
   it and is then floored at 0.  A negative start has len added to it and is
   then floored at 0.  A start larger than len is left as is.

   start and end must be modifiable lvalues.  len is evaluated more than
   once, so it should be free of side effects.  The expansion is a single
   statement: write a semicolon after the invocation. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008773
Alexander Belopolsky40018472011-02-26 01:02:56 +00008774Py_ssize_t
8775PyUnicode_Count(PyObject *str,
8776 PyObject *substr,
8777 Py_ssize_t start,
8778 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008780 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008781 PyObject* str_obj;
8782 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 int kind1, kind2, kind;
8784 void *buf1 = NULL, *buf2 = NULL;
8785 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008786
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008787 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008788 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008790 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008791 if (!sub_obj) {
8792 Py_DECREF(str_obj);
8793 return -1;
8794 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008795 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008796 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 Py_DECREF(str_obj);
8798 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799 }
Tim Petersced69f82003-09-16 20:30:58 +00008800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008801 kind1 = PyUnicode_KIND(str_obj);
8802 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008803 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008806 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008807 if (kind2 > kind) {
8808 Py_DECREF(sub_obj);
8809 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008810 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008811 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008812 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814 if (!buf2)
8815 goto onError;
8816 len1 = PyUnicode_GET_LENGTH(str_obj);
8817 len2 = PyUnicode_GET_LENGTH(sub_obj);
8818
8819 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008820 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008822 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8823 result = asciilib_count(
8824 ((Py_UCS1*)buf1) + start, end - start,
8825 buf2, len2, PY_SSIZE_T_MAX
8826 );
8827 else
8828 result = ucs1lib_count(
8829 ((Py_UCS1*)buf1) + start, end - start,
8830 buf2, len2, PY_SSIZE_T_MAX
8831 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 break;
8833 case PyUnicode_2BYTE_KIND:
8834 result = ucs2lib_count(
8835 ((Py_UCS2*)buf1) + start, end - start,
8836 buf2, len2, PY_SSIZE_T_MAX
8837 );
8838 break;
8839 case PyUnicode_4BYTE_KIND:
8840 result = ucs4lib_count(
8841 ((Py_UCS4*)buf1) + start, end - start,
8842 buf2, len2, PY_SSIZE_T_MAX
8843 );
8844 break;
8845 default:
8846 assert(0); result = 0;
8847 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008848
8849 Py_DECREF(sub_obj);
8850 Py_DECREF(str_obj);
8851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 if (kind2 != kind)
8853 PyMem_Free(buf2);
8854
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 onError:
8857 Py_DECREF(sub_obj);
8858 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008859 if (kind2 != kind && buf2)
8860 PyMem_Free(buf2);
8861 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862}
8863
Alexander Belopolsky40018472011-02-26 01:02:56 +00008864Py_ssize_t
8865PyUnicode_Find(PyObject *str,
8866 PyObject *sub,
8867 Py_ssize_t start,
8868 Py_ssize_t end,
8869 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008871 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008872
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008874 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008876 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008877 if (!sub) {
8878 Py_DECREF(str);
8879 return -2;
8880 }
8881 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8882 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 Py_DECREF(str);
8884 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885 }
Tim Petersced69f82003-09-16 20:30:58 +00008886
Victor Stinner794d5672011-10-10 03:21:36 +02008887 result = any_find_slice(direction,
8888 str, sub, start, end
8889 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008890
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008892 Py_DECREF(sub);
8893
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894 return result;
8895}
8896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008897Py_ssize_t
8898PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8899 Py_ssize_t start, Py_ssize_t end,
8900 int direction)
8901{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008903 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904 if (PyUnicode_READY(str) == -1)
8905 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008906 if (start < 0 || end < 0) {
8907 PyErr_SetString(PyExc_IndexError, "string index out of range");
8908 return -2;
8909 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910 if (end > PyUnicode_GET_LENGTH(str))
8911 end = PyUnicode_GET_LENGTH(str);
8912 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008913 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8914 kind, end-start, ch, direction);
8915 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008917 else
8918 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919}
8920
Alexander Belopolsky40018472011-02-26 01:02:56 +00008921static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008922tailmatch(PyObject *self,
8923 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008924 Py_ssize_t start,
8925 Py_ssize_t end,
8926 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 int kind_self;
8929 int kind_sub;
8930 void *data_self;
8931 void *data_sub;
8932 Py_ssize_t offset;
8933 Py_ssize_t i;
8934 Py_ssize_t end_sub;
8935
8936 if (PyUnicode_READY(self) == -1 ||
8937 PyUnicode_READY(substring) == -1)
8938 return 0;
8939
8940 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941 return 1;
8942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8944 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 kind_self = PyUnicode_KIND(self);
8949 data_self = PyUnicode_DATA(self);
8950 kind_sub = PyUnicode_KIND(substring);
8951 data_sub = PyUnicode_DATA(substring);
8952 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8953
8954 if (direction > 0)
8955 offset = end;
8956 else
8957 offset = start;
8958
8959 if (PyUnicode_READ(kind_self, data_self, offset) ==
8960 PyUnicode_READ(kind_sub, data_sub, 0) &&
8961 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8962 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8963 /* If both are of the same kind, memcmp is sufficient */
8964 if (kind_self == kind_sub) {
8965 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008966 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 data_sub,
8968 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008969 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 }
8971 /* otherwise we have to compare each character by first accesing it */
8972 else {
8973 /* We do not need to compare 0 and len(substring)-1 because
8974 the if statement above ensured already that they are equal
8975 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02008976 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 for (i = 1; i < end_sub; ++i) {
8978 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8979 PyUnicode_READ(kind_sub, data_sub, i))
8980 return 0;
8981 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984 }
8985
8986 return 0;
8987}
8988
Alexander Belopolsky40018472011-02-26 01:02:56 +00008989Py_ssize_t
8990PyUnicode_Tailmatch(PyObject *str,
8991 PyObject *substr,
8992 Py_ssize_t start,
8993 Py_ssize_t end,
8994 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008996 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008997
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998 str = PyUnicode_FromObject(str);
8999 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001 substr = PyUnicode_FromObject(substr);
9002 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 Py_DECREF(str);
9004 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 }
Tim Petersced69f82003-09-16 20:30:58 +00009006
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009007 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009008 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009 Py_DECREF(str);
9010 Py_DECREF(substr);
9011 return result;
9012}
9013
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014/* Apply fixfct filter to the Unicode object self and return a
9015 reference to the modified object */
9016
Alexander Belopolsky40018472011-02-26 01:02:56 +00009017static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009018fixup(PyObject *self,
9019 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021 PyObject *u;
9022 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009023 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009025 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009027 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009028 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 /* fix functions return the new maximum character in a string,
9031 if the kind of the resulting unicode object does not change,
9032 everything is fine. Otherwise we need to change the string kind
9033 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009034 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009035
9036 if (maxchar_new == 0) {
9037 /* no changes */;
9038 if (PyUnicode_CheckExact(self)) {
9039 Py_DECREF(u);
9040 Py_INCREF(self);
9041 return self;
9042 }
9043 else
9044 return u;
9045 }
9046
Victor Stinnere6abb482012-05-02 01:15:40 +02009047 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048
Victor Stinnereaab6042011-12-11 22:22:39 +01009049 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009051
9052 /* In case the maximum character changed, we need to
9053 convert the string to the new category. */
9054 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9055 if (v == NULL) {
9056 Py_DECREF(u);
9057 return NULL;
9058 }
9059 if (maxchar_new > maxchar_old) {
9060 /* If the maxchar increased so that the kind changed, not all
9061 characters are representable anymore and we need to fix the
9062 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009063 _PyUnicode_FastCopyCharacters(v, 0,
9064 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009065 maxchar_old = fixfct(v);
9066 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067 }
9068 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009069 _PyUnicode_FastCopyCharacters(v, 0,
9070 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009072 Py_DECREF(u);
9073 assert(_PyUnicode_CheckConsistency(v, 1));
9074 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075}
9076
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009077static PyObject *
9078ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009080 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9081 char *resdata, *data = PyUnicode_DATA(self);
9082 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009083
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009084 res = PyUnicode_New(len, 127);
9085 if (res == NULL)
9086 return NULL;
9087 resdata = PyUnicode_DATA(res);
9088 if (lower)
9089 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009091 _Py_bytes_upper(resdata, data, len);
9092 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093}
9094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009096handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009098 Py_ssize_t j;
9099 int final_sigma;
9100 Py_UCS4 c;
9101 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009102
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009103 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9104
9105 where ! is a negation and \p{xxx} is a character with property xxx.
9106 */
9107 for (j = i - 1; j >= 0; j--) {
9108 c = PyUnicode_READ(kind, data, j);
9109 if (!_PyUnicode_IsCaseIgnorable(c))
9110 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009112 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9113 if (final_sigma) {
9114 for (j = i + 1; j < length; j++) {
9115 c = PyUnicode_READ(kind, data, j);
9116 if (!_PyUnicode_IsCaseIgnorable(c))
9117 break;
9118 }
9119 final_sigma = j == length || !_PyUnicode_IsCased(c);
9120 }
9121 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122}
9123
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009124static int
9125lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9126 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009128 /* Obscure special case. */
9129 if (c == 0x3A3) {
9130 mapped[0] = handle_capital_sigma(kind, data, length, i);
9131 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009133 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134}
9135
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009136static Py_ssize_t
9137do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009139 Py_ssize_t i, k = 0;
9140 int n_res, j;
9141 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009142
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009143 c = PyUnicode_READ(kind, data, 0);
9144 n_res = _PyUnicode_ToUpperFull(c, mapped);
9145 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009146 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009147 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009149 for (i = 1; i < length; i++) {
9150 c = PyUnicode_READ(kind, data, i);
9151 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9152 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009153 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009154 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009155 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009156 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009157 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158}
9159
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009160static Py_ssize_t
9161do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9162 Py_ssize_t i, k = 0;
9163
9164 for (i = 0; i < length; i++) {
9165 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9166 int n_res, j;
9167 if (Py_UNICODE_ISUPPER(c)) {
9168 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9169 }
9170 else if (Py_UNICODE_ISLOWER(c)) {
9171 n_res = _PyUnicode_ToUpperFull(c, mapped);
9172 }
9173 else {
9174 n_res = 1;
9175 mapped[0] = c;
9176 }
9177 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009178 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009179 res[k++] = mapped[j];
9180 }
9181 }
9182 return k;
9183}
9184
9185static Py_ssize_t
9186do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9187 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009188{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009189 Py_ssize_t i, k = 0;
9190
9191 for (i = 0; i < length; i++) {
9192 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9193 int n_res, j;
9194 if (lower)
9195 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9196 else
9197 n_res = _PyUnicode_ToUpperFull(c, mapped);
9198 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009199 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009200 res[k++] = mapped[j];
9201 }
9202 }
9203 return k;
9204}
9205
9206static Py_ssize_t
9207do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9208{
9209 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9210}
9211
9212static Py_ssize_t
9213do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9214{
9215 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9216}
9217
Benjamin Petersone51757f2012-01-12 21:10:29 -05009218static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009219do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9220{
9221 Py_ssize_t i, k = 0;
9222
9223 for (i = 0; i < length; i++) {
9224 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9225 Py_UCS4 mapped[3];
9226 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9227 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009228 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009229 res[k++] = mapped[j];
9230 }
9231 }
9232 return k;
9233}
9234
9235static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009236do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9237{
9238 Py_ssize_t i, k = 0;
9239 int previous_is_cased;
9240
9241 previous_is_cased = 0;
9242 for (i = 0; i < length; i++) {
9243 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9244 Py_UCS4 mapped[3];
9245 int n_res, j;
9246
9247 if (previous_is_cased)
9248 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9249 else
9250 n_res = _PyUnicode_ToTitleFull(c, mapped);
9251
9252 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009253 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009254 res[k++] = mapped[j];
9255 }
9256
9257 previous_is_cased = _PyUnicode_IsCased(c);
9258 }
9259 return k;
9260}
9261
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009262static PyObject *
9263case_operation(PyObject *self,
9264 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9265{
9266 PyObject *res = NULL;
9267 Py_ssize_t length, newlength = 0;
9268 int kind, outkind;
9269 void *data, *outdata;
9270 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9271
Benjamin Petersoneea48462012-01-16 14:28:50 -05009272 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009273
9274 kind = PyUnicode_KIND(self);
9275 data = PyUnicode_DATA(self);
9276 length = PyUnicode_GET_LENGTH(self);
9277 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9278 if (tmp == NULL)
9279 return PyErr_NoMemory();
9280 newlength = perform(kind, data, length, tmp, &maxchar);
9281 res = PyUnicode_New(newlength, maxchar);
9282 if (res == NULL)
9283 goto leave;
9284 tmpend = tmp + newlength;
9285 outdata = PyUnicode_DATA(res);
9286 outkind = PyUnicode_KIND(res);
9287 switch (outkind) {
9288 case PyUnicode_1BYTE_KIND:
9289 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9290 break;
9291 case PyUnicode_2BYTE_KIND:
9292 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9293 break;
9294 case PyUnicode_4BYTE_KIND:
9295 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9296 break;
9297 default:
9298 assert(0);
9299 break;
9300 }
9301 leave:
9302 PyMem_FREE(tmp);
9303 return res;
9304}
9305
Tim Peters8ce9f162004-08-27 01:49:32 +00009306PyObject *
9307PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009310 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009312 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009313 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9314 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009315 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009317 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009319 int use_memcpy;
9320 unsigned char *res_data = NULL, *sep_data = NULL;
9321 PyObject *last_obj;
9322 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009323
Tim Peters05eba1f2004-08-27 21:32:02 +00009324 fseq = PySequence_Fast(seq, "");
9325 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009326 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009327 }
9328
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009329 /* NOTE: the following code can't call back into Python code,
9330 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009331 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009332
Tim Peters05eba1f2004-08-27 21:32:02 +00009333 seqlen = PySequence_Fast_GET_SIZE(fseq);
9334 /* If empty sequence, return u"". */
9335 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009336 Py_DECREF(fseq);
9337 Py_INCREF(unicode_empty);
9338 res = unicode_empty;
9339 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009340 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009341
Tim Peters05eba1f2004-08-27 21:32:02 +00009342 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009343 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009344 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009345 if (seqlen == 1) {
9346 if (PyUnicode_CheckExact(items[0])) {
9347 res = items[0];
9348 Py_INCREF(res);
9349 Py_DECREF(fseq);
9350 return res;
9351 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009352 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009353 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009354 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009355 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009356 /* Set up sep and seplen */
9357 if (separator == NULL) {
9358 /* fall back to a blank space separator */
9359 sep = PyUnicode_FromOrdinal(' ');
9360 if (!sep)
9361 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009362 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009363 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009364 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009365 else {
9366 if (!PyUnicode_Check(separator)) {
9367 PyErr_Format(PyExc_TypeError,
9368 "separator: expected str instance,"
9369 " %.80s found",
9370 Py_TYPE(separator)->tp_name);
9371 goto onError;
9372 }
9373 if (PyUnicode_READY(separator))
9374 goto onError;
9375 sep = separator;
9376 seplen = PyUnicode_GET_LENGTH(separator);
9377 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9378 /* inc refcount to keep this code path symmetric with the
9379 above case of a blank separator */
9380 Py_INCREF(sep);
9381 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009382 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009383 }
9384
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009385 /* There are at least two things to join, or else we have a subclass
9386 * of str in the sequence.
9387 * Do a pre-pass to figure out the total amount of space we'll
9388 * need (sz), and see whether all argument are strings.
9389 */
9390 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009391#ifdef Py_DEBUG
9392 use_memcpy = 0;
9393#else
9394 use_memcpy = 1;
9395#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009396 for (i = 0; i < seqlen; i++) {
9397 const Py_ssize_t old_sz = sz;
9398 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009399 if (!PyUnicode_Check(item)) {
9400 PyErr_Format(PyExc_TypeError,
9401 "sequence item %zd: expected str instance,"
9402 " %.80s found",
9403 i, Py_TYPE(item)->tp_name);
9404 goto onError;
9405 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 if (PyUnicode_READY(item) == -1)
9407 goto onError;
9408 sz += PyUnicode_GET_LENGTH(item);
9409 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009410 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009411 if (i != 0)
9412 sz += seplen;
9413 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9414 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009415 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009416 goto onError;
9417 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009418 if (use_memcpy && last_obj != NULL) {
9419 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9420 use_memcpy = 0;
9421 }
9422 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009423 }
Tim Petersced69f82003-09-16 20:30:58 +00009424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009426 if (res == NULL)
9427 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009428
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009429 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009430#ifdef Py_DEBUG
9431 use_memcpy = 0;
9432#else
9433 if (use_memcpy) {
9434 res_data = PyUnicode_1BYTE_DATA(res);
9435 kind = PyUnicode_KIND(res);
9436 if (seplen != 0)
9437 sep_data = PyUnicode_1BYTE_DATA(sep);
9438 }
9439#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009441 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009442 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009443 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009444 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009445 if (use_memcpy) {
9446 Py_MEMCPY(res_data,
9447 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009448 kind * seplen);
9449 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009450 }
9451 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009452 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009453 res_offset += seplen;
9454 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009455 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009456 itemlen = PyUnicode_GET_LENGTH(item);
9457 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009458 if (use_memcpy) {
9459 Py_MEMCPY(res_data,
9460 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009461 kind * itemlen);
9462 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009463 }
9464 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009465 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009466 res_offset += itemlen;
9467 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009468 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009469 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009470 if (use_memcpy)
9471 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009472 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009473 else
9474 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009475
Tim Peters05eba1f2004-08-27 21:32:02 +00009476 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009478 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480
Benjamin Peterson29060642009-01-31 22:14:21 +00009481 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009482 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009484 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485 return NULL;
9486}
9487
/* Store `length` copies of the code point `value` into the character
   buffer `data`, starting at element index `start`.

   `kind` selects the element width: PyUnicode_1BYTE_KIND,
   PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND.  PyUnicode_WCHAR_KIND and
   any other value are rejected by an assertion.  `value` is not
   range-checked: the 1-byte case truncates it with a cast, so the caller
   must pass a value that fits the selected kind.

   `value` and `length` can be evaluated more than once; do not pass
   expressions with side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9511
Victor Stinnerd3f08822012-05-29 12:57:52 +02009512void
9513_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9514 Py_UCS4 fill_char)
9515{
9516 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9517 const void *data = PyUnicode_DATA(unicode);
9518 assert(PyUnicode_IS_READY(unicode));
9519 assert(unicode_modifiable(unicode));
9520 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9521 assert(start >= 0);
9522 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9523 FILL(kind, data, fill_char, start, length);
9524}
9525
Victor Stinner3fe55312012-01-04 00:33:50 +01009526Py_ssize_t
9527PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9528 Py_UCS4 fill_char)
9529{
9530 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009531
9532 if (!PyUnicode_Check(unicode)) {
9533 PyErr_BadInternalCall();
9534 return -1;
9535 }
9536 if (PyUnicode_READY(unicode) == -1)
9537 return -1;
9538 if (unicode_check_modifiable(unicode))
9539 return -1;
9540
Victor Stinnerd3f08822012-05-29 12:57:52 +02009541 if (start < 0) {
9542 PyErr_SetString(PyExc_IndexError, "string index out of range");
9543 return -1;
9544 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009545 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9546 PyErr_SetString(PyExc_ValueError,
9547 "fill character is bigger than "
9548 "the string maximum character");
9549 return -1;
9550 }
9551
9552 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9553 length = Py_MIN(maxlen, length);
9554 if (length <= 0)
9555 return 0;
9556
Victor Stinnerd3f08822012-05-29 12:57:52 +02009557 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009558 return length;
9559}
9560
Victor Stinner9310abb2011-10-05 00:59:23 +02009561static PyObject *
9562pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009563 Py_ssize_t left,
9564 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 PyObject *u;
9568 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009569 int kind;
9570 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571
9572 if (left < 0)
9573 left = 0;
9574 if (right < 0)
9575 right = 0;
9576
Victor Stinnerc4b49542011-12-11 22:44:26 +01009577 if (left == 0 && right == 0)
9578 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9581 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009582 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9583 return NULL;
9584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009585 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009586 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009587 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009588 if (!u)
9589 return NULL;
9590
9591 kind = PyUnicode_KIND(u);
9592 data = PyUnicode_DATA(u);
9593 if (left)
9594 FILL(kind, data, fill, 0, left);
9595 if (right)
9596 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009597 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009598 assert(_PyUnicode_CheckConsistency(u, 1));
9599 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009600}
9601
Alexander Belopolsky40018472011-02-26 01:02:56 +00009602PyObject *
9603PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606
9607 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009608 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009609 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009610 if (PyUnicode_READY(string) == -1) {
9611 Py_DECREF(string);
9612 return NULL;
9613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009614
Benjamin Petersonead6b532011-12-20 17:23:42 -06009615 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009617 if (PyUnicode_IS_ASCII(string))
9618 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009619 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009620 PyUnicode_GET_LENGTH(string), keepends);
9621 else
9622 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009623 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009624 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 break;
9626 case PyUnicode_2BYTE_KIND:
9627 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009628 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 PyUnicode_GET_LENGTH(string), keepends);
9630 break;
9631 case PyUnicode_4BYTE_KIND:
9632 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009633 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 PyUnicode_GET_LENGTH(string), keepends);
9635 break;
9636 default:
9637 assert(0);
9638 list = 0;
9639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640 Py_DECREF(string);
9641 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642}
9643
Alexander Belopolsky40018472011-02-26 01:02:56 +00009644static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009645split(PyObject *self,
9646 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009647 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 int kind1, kind2, kind;
9650 void *buf1, *buf2;
9651 Py_ssize_t len1, len2;
9652 PyObject* out;
9653
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009655 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 if (PyUnicode_READY(self) == -1)
9658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009661 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009662 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009663 if (PyUnicode_IS_ASCII(self))
9664 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009665 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009666 PyUnicode_GET_LENGTH(self), maxcount
9667 );
9668 else
9669 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009670 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009671 PyUnicode_GET_LENGTH(self), maxcount
9672 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 case PyUnicode_2BYTE_KIND:
9674 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009675 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 PyUnicode_GET_LENGTH(self), maxcount
9677 );
9678 case PyUnicode_4BYTE_KIND:
9679 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009680 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009681 PyUnicode_GET_LENGTH(self), maxcount
9682 );
9683 default:
9684 assert(0);
9685 return NULL;
9686 }
9687
9688 if (PyUnicode_READY(substring) == -1)
9689 return NULL;
9690
9691 kind1 = PyUnicode_KIND(self);
9692 kind2 = PyUnicode_KIND(substring);
9693 kind = kind1 > kind2 ? kind1 : kind2;
9694 buf1 = PyUnicode_DATA(self);
9695 buf2 = PyUnicode_DATA(substring);
9696 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009697 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 if (!buf1)
9699 return NULL;
9700 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009701 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 if (!buf2) {
9703 if (kind1 != kind) PyMem_Free(buf1);
9704 return NULL;
9705 }
9706 len1 = PyUnicode_GET_LENGTH(self);
9707 len2 = PyUnicode_GET_LENGTH(substring);
9708
Benjamin Petersonead6b532011-12-20 17:23:42 -06009709 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009711 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9712 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009713 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009714 else
9715 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009716 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 break;
9718 case PyUnicode_2BYTE_KIND:
9719 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009720 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 break;
9722 case PyUnicode_4BYTE_KIND:
9723 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009724 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 break;
9726 default:
9727 out = NULL;
9728 }
9729 if (kind1 != kind)
9730 PyMem_Free(buf1);
9731 if (kind2 != kind)
9732 PyMem_Free(buf2);
9733 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734}
9735
Alexander Belopolsky40018472011-02-26 01:02:56 +00009736static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009737rsplit(PyObject *self,
9738 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009739 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009740{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741 int kind1, kind2, kind;
9742 void *buf1, *buf2;
9743 Py_ssize_t len1, len2;
9744 PyObject* out;
9745
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009746 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009747 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009749 if (PyUnicode_READY(self) == -1)
9750 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009753 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009754 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009755 if (PyUnicode_IS_ASCII(self))
9756 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009757 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009758 PyUnicode_GET_LENGTH(self), maxcount
9759 );
9760 else
9761 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009762 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009763 PyUnicode_GET_LENGTH(self), maxcount
9764 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009765 case PyUnicode_2BYTE_KIND:
9766 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009767 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 PyUnicode_GET_LENGTH(self), maxcount
9769 );
9770 case PyUnicode_4BYTE_KIND:
9771 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009772 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 PyUnicode_GET_LENGTH(self), maxcount
9774 );
9775 default:
9776 assert(0);
9777 return NULL;
9778 }
9779
9780 if (PyUnicode_READY(substring) == -1)
9781 return NULL;
9782
9783 kind1 = PyUnicode_KIND(self);
9784 kind2 = PyUnicode_KIND(substring);
9785 kind = kind1 > kind2 ? kind1 : kind2;
9786 buf1 = PyUnicode_DATA(self);
9787 buf2 = PyUnicode_DATA(substring);
9788 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009789 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 if (!buf1)
9791 return NULL;
9792 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009793 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 if (!buf2) {
9795 if (kind1 != kind) PyMem_Free(buf1);
9796 return NULL;
9797 }
9798 len1 = PyUnicode_GET_LENGTH(self);
9799 len2 = PyUnicode_GET_LENGTH(substring);
9800
Benjamin Petersonead6b532011-12-20 17:23:42 -06009801 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009803 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9804 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009805 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009806 else
9807 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009808 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 break;
9810 case PyUnicode_2BYTE_KIND:
9811 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009812 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 break;
9814 case PyUnicode_4BYTE_KIND:
9815 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009816 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 break;
9818 default:
9819 out = NULL;
9820 }
9821 if (kind1 != kind)
9822 PyMem_Free(buf1);
9823 if (kind2 != kind)
9824 PyMem_Free(buf2);
9825 return out;
9826}
9827
9828static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009829anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9830 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009832 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009834 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9835 return asciilib_find(buf1, len1, buf2, len2, offset);
9836 else
9837 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 case PyUnicode_2BYTE_KIND:
9839 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9840 case PyUnicode_4BYTE_KIND:
9841 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9842 }
9843 assert(0);
9844 return -1;
9845}
9846
9847static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009848anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9849 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009851 switch (kind) {
9852 case PyUnicode_1BYTE_KIND:
9853 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9854 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9855 else
9856 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9857 case PyUnicode_2BYTE_KIND:
9858 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9859 case PyUnicode_4BYTE_KIND:
9860 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9861 }
9862 assert(0);
9863 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009864}
9865
Alexander Belopolsky40018472011-02-26 01:02:56 +00009866static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867replace(PyObject *self, PyObject *str1,
9868 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 PyObject *u;
9871 char *sbuf = PyUnicode_DATA(self);
9872 char *buf1 = PyUnicode_DATA(str1);
9873 char *buf2 = PyUnicode_DATA(str2);
9874 int srelease = 0, release1 = 0, release2 = 0;
9875 int skind = PyUnicode_KIND(self);
9876 int kind1 = PyUnicode_KIND(str1);
9877 int kind2 = PyUnicode_KIND(str2);
9878 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9879 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9880 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009881 int mayshrink;
9882 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883
9884 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009885 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009887 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888
Victor Stinner59de0ee2011-10-07 10:01:28 +02009889 if (str1 == str2)
9890 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 if (skind < kind1)
9892 /* substring too wide to be present */
9893 goto nothing;
9894
Victor Stinner49a0a212011-10-12 23:46:10 +02009895 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9896 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9897 /* Replacing str1 with str2 may cause a maxchar reduction in the
9898 result string. */
9899 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +02009900 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009903 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009905 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009907 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009908 Py_UCS4 u1, u2;
9909 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +01009910 Py_ssize_t index, pos;
9911 char *src;
9912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +01009914 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9915 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009916 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009919 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +02009921 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +01009923
9924 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9925 index = 0;
9926 src = sbuf;
9927 while (--maxcount)
9928 {
9929 pos++;
9930 src += pos * PyUnicode_KIND(self);
9931 slen -= pos;
9932 index += pos;
9933 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9934 if (pos < 0)
9935 break;
9936 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9937 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009938 }
9939 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 int rkind = skind;
9941 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009942 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 if (kind1 < rkind) {
9945 /* widen substring */
9946 buf1 = _PyUnicode_AsKind(str1, rkind);
9947 if (!buf1) goto error;
9948 release1 = 1;
9949 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009950 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009951 if (i < 0)
9952 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 if (rkind > kind2) {
9954 /* widen replacement */
9955 buf2 = _PyUnicode_AsKind(str2, rkind);
9956 if (!buf2) goto error;
9957 release2 = 1;
9958 }
9959 else if (rkind < kind2) {
9960 /* widen self and buf1 */
9961 rkind = kind2;
9962 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +01009963 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 sbuf = _PyUnicode_AsKind(self, rkind);
9965 if (!sbuf) goto error;
9966 srelease = 1;
9967 buf1 = _PyUnicode_AsKind(str1, rkind);
9968 if (!buf1) goto error;
9969 release1 = 1;
9970 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009971 u = PyUnicode_New(slen, maxchar);
9972 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009974 assert(PyUnicode_KIND(u) == rkind);
9975 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009976
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009977 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009978 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009979 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009981 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009983
9984 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009985 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009986 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009987 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009988 if (i == -1)
9989 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009990 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009992 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009994 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009996 }
9997 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01009999 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 int rkind = skind;
10001 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010004 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 buf1 = _PyUnicode_AsKind(str1, rkind);
10006 if (!buf1) goto error;
10007 release1 = 1;
10008 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010009 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010010 if (n == 0)
10011 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010013 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 buf2 = _PyUnicode_AsKind(str2, rkind);
10015 if (!buf2) goto error;
10016 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010019 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 rkind = kind2;
10021 sbuf = _PyUnicode_AsKind(self, rkind);
10022 if (!sbuf) goto error;
10023 srelease = 1;
10024 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010025 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 buf1 = _PyUnicode_AsKind(str1, rkind);
10027 if (!buf1) goto error;
10028 release1 = 1;
10029 }
10030 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10031 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010032 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 PyErr_SetString(PyExc_OverflowError,
10034 "replace string is too long");
10035 goto error;
10036 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010037 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010038 if (new_size == 0) {
10039 Py_INCREF(unicode_empty);
10040 u = unicode_empty;
10041 goto done;
10042 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010043 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 PyErr_SetString(PyExc_OverflowError,
10045 "replace string is too long");
10046 goto error;
10047 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010048 u = PyUnicode_New(new_size, maxchar);
10049 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010051 assert(PyUnicode_KIND(u) == rkind);
10052 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 ires = i = 0;
10054 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010055 while (n-- > 0) {
10056 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010057 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010058 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010059 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010060 if (j == -1)
10061 break;
10062 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010063 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010064 memcpy(res + rkind * ires,
10065 sbuf + rkind * i,
10066 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010068 }
10069 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010071 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010073 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010079 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010080 memcpy(res + rkind * ires,
10081 sbuf + rkind * i,
10082 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010083 }
10084 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010085 /* interleave */
10086 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010087 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010089 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010091 if (--n <= 0)
10092 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010093 memcpy(res + rkind * ires,
10094 sbuf + rkind * i,
10095 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 ires++;
10097 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010098 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010099 memcpy(res + rkind * ires,
10100 sbuf + rkind * i,
10101 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010102 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010103 }
10104
10105 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010106 unicode_adjust_maxchar(&u);
10107 if (u == NULL)
10108 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010109 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010110
10111 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 if (srelease)
10113 PyMem_FREE(sbuf);
10114 if (release1)
10115 PyMem_FREE(buf1);
10116 if (release2)
10117 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010118 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010120
Benjamin Peterson29060642009-01-31 22:14:21 +000010121 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010122 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 if (srelease)
10124 PyMem_FREE(sbuf);
10125 if (release1)
10126 PyMem_FREE(buf1);
10127 if (release2)
10128 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010129 return unicode_result_unchanged(self);
10130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 error:
10132 if (srelease && sbuf)
10133 PyMem_FREE(sbuf);
10134 if (release1 && buf1)
10135 PyMem_FREE(buf1);
10136 if (release2 && buf2)
10137 PyMem_FREE(buf2);
10138 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139}
10140
10141/* --- Unicode Object Methods --------------------------------------------- */
10142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010143PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010144 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145\n\
10146Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010147characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148
10149static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010150unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010152 if (PyUnicode_READY(self) == -1)
10153 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010154 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155}
10156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010157PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010158 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159\n\
10160Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010161have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162
10163static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010164unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010166 if (PyUnicode_READY(self) == -1)
10167 return NULL;
10168 if (PyUnicode_GET_LENGTH(self) == 0)
10169 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010170 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171}
10172
Benjamin Petersond5890c82012-01-14 13:23:30 -050010173PyDoc_STRVAR(casefold__doc__,
10174 "S.casefold() -> str\n\
10175\n\
10176Return a version of S suitable for caseless comparisons.");
10177
10178static PyObject *
10179unicode_casefold(PyObject *self)
10180{
10181 if (PyUnicode_READY(self) == -1)
10182 return NULL;
10183 if (PyUnicode_IS_ASCII(self))
10184 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010185 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010186}
10187
10188
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010189/* Argument converter. Coerces to a single unicode character */
10190
10191static int
10192convert_uc(PyObject *obj, void *addr)
10193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010195 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010196
Benjamin Peterson14339b62009-01-31 16:36:08 +000010197 uniobj = PyUnicode_FromObject(obj);
10198 if (uniobj == NULL) {
10199 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010200 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010201 return 0;
10202 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010204 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010205 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010206 Py_DECREF(uniobj);
10207 return 0;
10208 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010210 Py_DECREF(uniobj);
10211 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010212}
10213
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010214PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010215 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010217Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010218done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219
10220static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010221unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010223 Py_ssize_t marg, left;
10224 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 Py_UCS4 fillchar = ' ';
10226
Victor Stinnere9a29352011-10-01 02:14:59 +020010227 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229
Benjamin Petersonbac79492012-01-14 13:34:47 -050010230 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231 return NULL;
10232
Victor Stinnerc4b49542011-12-11 22:44:26 +010010233 if (PyUnicode_GET_LENGTH(self) >= width)
10234 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235
Victor Stinnerc4b49542011-12-11 22:44:26 +010010236 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237 left = marg / 2 + (marg & width & 1);
10238
Victor Stinner9310abb2011-10-05 00:59:23 +020010239 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240}
10241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242/* This function assumes that str1 and str2 are readied by the caller. */
10243
Marc-André Lemburge5034372000-08-08 08:04:29 +000010244static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010245unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010246{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 int kind1, kind2;
10248 void *data1, *data2;
Victor Stinner770e19e2012-10-04 22:59:45 +020010249 Py_ssize_t len1, len2;
10250 Py_ssize_t i, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010251
Victor Stinner90db9c42012-10-04 21:53:50 +020010252 /* a string is equal to itself */
10253 if (str1 == str2)
10254 return 0;
10255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 kind1 = PyUnicode_KIND(str1);
10257 kind2 = PyUnicode_KIND(str2);
10258 data1 = PyUnicode_DATA(str1);
10259 data2 = PyUnicode_DATA(str2);
10260 len1 = PyUnicode_GET_LENGTH(str1);
10261 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010262 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010263
Victor Stinner770e19e2012-10-04 22:59:45 +020010264 if (kind1 == 1 && kind2 == 1) {
10265 int cmp = memcmp(data1, data2, len);
10266 /* normalize result of memcmp() into the range [-1; 1] */
10267 if (cmp < 0)
10268 return -1;
10269 if (cmp > 0)
10270 return 1;
10271 }
10272 else {
10273 for (i = 0; i < len; ++i) {
10274 Py_UCS4 c1, c2;
10275 c1 = PyUnicode_READ(kind1, data1, i);
10276 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010277
Victor Stinner770e19e2012-10-04 22:59:45 +020010278 if (c1 != c2)
10279 return (c1 < c2) ? -1 : 1;
10280 }
Marc-André Lemburge5034372000-08-08 08:04:29 +000010281 }
10282
Victor Stinner770e19e2012-10-04 22:59:45 +020010283 if (len1 == len2)
10284 return 0;
10285 if (len1 < len2)
10286 return -1;
10287 else
10288 return 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010289}
10290
Victor Stinnere5567ad2012-10-23 02:48:49 +020010291static int
10292unicode_compare_eq(PyObject *str1, PyObject *str2)
10293{
10294 int kind;
10295 void *data1, *data2;
10296 Py_ssize_t len;
10297 int cmp;
10298
10299 /* a string is equal to itself */
10300 if (str1 == str2)
10301 return 1;
10302
10303 len = PyUnicode_GET_LENGTH(str1);
10304 if (PyUnicode_GET_LENGTH(str2) != len)
10305 return 0;
10306 kind = PyUnicode_KIND(str1);
10307 if (PyUnicode_KIND(str2) != kind)
10308 return 0;
10309 data1 = PyUnicode_DATA(str1);
10310 data2 = PyUnicode_DATA(str2);
10311
10312 cmp = memcmp(data1, data2, len * kind);
10313 return (cmp == 0);
10314}
10315
10316
Alexander Belopolsky40018472011-02-26 01:02:56 +000010317int
10318PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10321 if (PyUnicode_READY(left) == -1 ||
10322 PyUnicode_READY(right) == -1)
10323 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010324 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010326 PyErr_Format(PyExc_TypeError,
10327 "Can't compare %.100s and %.100s",
10328 left->ob_type->tp_name,
10329 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330 return -1;
10331}
10332
Martin v. Löwis5b222132007-06-10 09:51:05 +000010333int
10334PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10335{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 Py_ssize_t i;
10337 int kind;
10338 void *data;
10339 Py_UCS4 chr;
10340
Victor Stinner910337b2011-10-03 03:20:16 +020010341 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 if (PyUnicode_READY(uni) == -1)
10343 return -1;
10344 kind = PyUnicode_KIND(uni);
10345 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010346 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10348 if (chr != str[i])
10349 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010350 /* This check keeps Python strings that end in '\0' from comparing equal
10351 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010353 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010354 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010355 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010356 return 0;
10357}
10358
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010359
/* Map a C truth value onto Py_True / Py_False.  The macro itself does not
   change any reference count; users must Py_INCREF the result before
   returning it as a new reference. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010362
Alexander Belopolsky40018472011-02-26 01:02:56 +000010363PyObject *
10364PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010365{
10366 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010367 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010368
Victor Stinnere5567ad2012-10-23 02:48:49 +020010369 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10370 Py_RETURN_NOTIMPLEMENTED;
10371
10372 if (PyUnicode_READY(left) == -1 ||
10373 PyUnicode_READY(right) == -1)
10374 return NULL;
10375
10376 if (op == Py_EQ || op == Py_NE) {
10377 result = unicode_compare_eq(left, right);
10378 if (op == Py_EQ)
10379 v = TEST_COND(result);
10380 else
10381 v = TEST_COND(!result);
10382 }
10383 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010384 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010385
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010386 /* Convert the return value to a Boolean */
10387 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010388 case Py_LE:
10389 v = TEST_COND(result <= 0);
10390 break;
10391 case Py_GE:
10392 v = TEST_COND(result >= 0);
10393 break;
10394 case Py_LT:
10395 v = TEST_COND(result == -1);
10396 break;
10397 case Py_GT:
10398 v = TEST_COND(result == 1);
10399 break;
10400 default:
10401 PyErr_BadArgument();
10402 return NULL;
10403 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010404 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010405 Py_INCREF(v);
10406 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010407}
10408
Alexander Belopolsky40018472011-02-26 01:02:56 +000010409int
10410PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010411{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010412 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 int kind1, kind2, kind;
10414 void *buf1, *buf2;
10415 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010416 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010417
10418 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010419 sub = PyUnicode_FromObject(element);
10420 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010421 PyErr_Format(PyExc_TypeError,
10422 "'in <string>' requires string as left operand, not %s",
10423 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010424 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010425 }
10426
Thomas Wouters477c8d52006-05-27 19:21:47 +000010427 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010428 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010429 Py_DECREF(sub);
10430 return -1;
10431 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010432 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10433 Py_DECREF(sub);
10434 Py_DECREF(str);
10435 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 kind1 = PyUnicode_KIND(str);
10438 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010439 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 buf1 = PyUnicode_DATA(str);
10441 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010442 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010443 if (kind2 > kind) {
10444 Py_DECREF(sub);
10445 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010446 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010447 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010448 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 if (!buf2) {
10451 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010452 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 return -1;
10454 }
10455 len1 = PyUnicode_GET_LENGTH(str);
10456 len2 = PyUnicode_GET_LENGTH(sub);
10457
Benjamin Petersonead6b532011-12-20 17:23:42 -060010458 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 case PyUnicode_1BYTE_KIND:
10460 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10461 break;
10462 case PyUnicode_2BYTE_KIND:
10463 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10464 break;
10465 case PyUnicode_4BYTE_KIND:
10466 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10467 break;
10468 default:
10469 result = -1;
10470 assert(0);
10471 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010472
10473 Py_DECREF(str);
10474 Py_DECREF(sub);
10475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 if (kind2 != kind)
10477 PyMem_Free(buf2);
10478
Guido van Rossum403d68b2000-03-13 15:55:09 +000010479 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010480}
10481
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482/* Concat to string or Unicode object giving a new Unicode object. */
10483
Alexander Belopolsky40018472011-02-26 01:02:56 +000010484PyObject *
10485PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010488 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010489 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490
10491 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010494 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010497 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498
10499 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010500 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010501 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010504 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010505 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010507 }
10508
Victor Stinner488fa492011-12-12 00:01:39 +010010509 u_len = PyUnicode_GET_LENGTH(u);
10510 v_len = PyUnicode_GET_LENGTH(v);
10511 if (u_len > PY_SSIZE_T_MAX - v_len) {
10512 PyErr_SetString(PyExc_OverflowError,
10513 "strings are too large to concat");
10514 goto onError;
10515 }
10516 new_len = u_len + v_len;
10517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010519 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010520 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010523 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010525 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010526 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10527 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528 Py_DECREF(u);
10529 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010530 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532
Benjamin Peterson29060642009-01-31 22:14:21 +000010533 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534 Py_XDECREF(u);
10535 Py_XDECREF(v);
10536 return NULL;
10537}
10538
Walter Dörwald1ab83302007-05-18 17:15:44 +000010539void
Victor Stinner23e56682011-10-03 03:54:37 +020010540PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010541{
Victor Stinner23e56682011-10-03 03:54:37 +020010542 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010543 Py_UCS4 maxchar, maxchar2;
10544 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010545
10546 if (p_left == NULL) {
10547 if (!PyErr_Occurred())
10548 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010549 return;
10550 }
Victor Stinner23e56682011-10-03 03:54:37 +020010551 left = *p_left;
10552 if (right == NULL || !PyUnicode_Check(left)) {
10553 if (!PyErr_Occurred())
10554 PyErr_BadInternalCall();
10555 goto error;
10556 }
10557
Benjamin Petersonbac79492012-01-14 13:34:47 -050010558 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010559 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010560 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010561 goto error;
10562
Victor Stinner488fa492011-12-12 00:01:39 +010010563 /* Shortcuts */
10564 if (left == unicode_empty) {
10565 Py_DECREF(left);
10566 Py_INCREF(right);
10567 *p_left = right;
10568 return;
10569 }
10570 if (right == unicode_empty)
10571 return;
10572
10573 left_len = PyUnicode_GET_LENGTH(left);
10574 right_len = PyUnicode_GET_LENGTH(right);
10575 if (left_len > PY_SSIZE_T_MAX - right_len) {
10576 PyErr_SetString(PyExc_OverflowError,
10577 "strings are too large to concat");
10578 goto error;
10579 }
10580 new_len = left_len + right_len;
10581
10582 if (unicode_modifiable(left)
10583 && PyUnicode_CheckExact(right)
10584 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010585 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10586 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010587 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010588 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010589 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10590 {
10591 /* append inplace */
10592 if (unicode_resize(p_left, new_len) != 0) {
10593 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10594 * deallocated so it cannot be put back into
10595 * 'variable'. The MemoryError is raised when there
10596 * is no value in 'variable', which might (very
10597 * remotely) be a cause of incompatibilities.
10598 */
10599 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010600 }
Victor Stinner488fa492011-12-12 00:01:39 +010010601 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010602 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010603 }
Victor Stinner488fa492011-12-12 00:01:39 +010010604 else {
10605 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10606 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010607 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010608
Victor Stinner488fa492011-12-12 00:01:39 +010010609 /* Concat the two Unicode strings */
10610 res = PyUnicode_New(new_len, maxchar);
10611 if (res == NULL)
10612 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010613 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10614 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010615 Py_DECREF(left);
10616 *p_left = res;
10617 }
10618 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010619 return;
10620
10621error:
Victor Stinner488fa492011-12-12 00:01:39 +010010622 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010623}
10624
10625void
10626PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10627{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010628 PyUnicode_Append(pleft, right);
10629 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010630}
10631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010632PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010633 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010634\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010635Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010636string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010637interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638
10639static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010640unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010642 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010643 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010644 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 int kind1, kind2, kind;
10647 void *buf1, *buf2;
10648 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649
Jesus Ceaac451502011-04-20 17:09:23 +020010650 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10651 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010652 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 kind1 = PyUnicode_KIND(self);
10655 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010656 if (kind2 > kind1)
10657 return PyLong_FromLong(0);
10658 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 buf1 = PyUnicode_DATA(self);
10660 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010662 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 if (!buf2) {
10664 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 return NULL;
10666 }
10667 len1 = PyUnicode_GET_LENGTH(self);
10668 len2 = PyUnicode_GET_LENGTH(substring);
10669
10670 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010671 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 case PyUnicode_1BYTE_KIND:
10673 iresult = ucs1lib_count(
10674 ((Py_UCS1*)buf1) + start, end - start,
10675 buf2, len2, PY_SSIZE_T_MAX
10676 );
10677 break;
10678 case PyUnicode_2BYTE_KIND:
10679 iresult = ucs2lib_count(
10680 ((Py_UCS2*)buf1) + start, end - start,
10681 buf2, len2, PY_SSIZE_T_MAX
10682 );
10683 break;
10684 case PyUnicode_4BYTE_KIND:
10685 iresult = ucs4lib_count(
10686 ((Py_UCS4*)buf1) + start, end - start,
10687 buf2, len2, PY_SSIZE_T_MAX
10688 );
10689 break;
10690 default:
10691 assert(0); iresult = 0;
10692 }
10693
10694 result = PyLong_FromSsize_t(iresult);
10695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 if (kind2 != kind)
10697 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698
10699 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010700
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701 return result;
10702}
10703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010704PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010705 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010707Encode S using the codec registered for encoding. Default encoding\n\
10708is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010709handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010710a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10711'xmlcharrefreplace' as well as any other name registered with\n\
10712codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713
10714static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010715unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010717 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718 char *encoding = NULL;
10719 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010720
Benjamin Peterson308d6372009-09-18 21:42:35 +000010721 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10722 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010724 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010725}
10726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010727PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010728 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729\n\
10730Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010731If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732
10733static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010734unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010736 Py_ssize_t i, j, line_pos, src_len, incr;
10737 Py_UCS4 ch;
10738 PyObject *u;
10739 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010741 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010742 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743
10744 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010745 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746
Antoine Pitrou22425222011-10-04 19:10:51 +020010747 if (PyUnicode_READY(self) == -1)
10748 return NULL;
10749
Thomas Wouters7e474022000-07-16 12:04:32 +000010750 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010751 src_len = PyUnicode_GET_LENGTH(self);
10752 i = j = line_pos = 0;
10753 kind = PyUnicode_KIND(self);
10754 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010755 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010756 for (; i < src_len; i++) {
10757 ch = PyUnicode_READ(kind, src_data, i);
10758 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010759 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010760 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010761 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010762 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010763 goto overflow;
10764 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010765 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010766 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010769 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010770 goto overflow;
10771 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010773 if (ch == '\n' || ch == '\r')
10774 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010776 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010777 if (!found)
10778 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010779
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010781 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782 if (!u)
10783 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010784 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785
Antoine Pitroue71d5742011-10-04 15:55:09 +020010786 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787
Antoine Pitroue71d5742011-10-04 15:55:09 +020010788 for (; i < src_len; i++) {
10789 ch = PyUnicode_READ(kind, src_data, i);
10790 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010791 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010792 incr = tabsize - (line_pos % tabsize);
10793 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010794 FILL(kind, dest_data, ' ', j, incr);
10795 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010796 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010797 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010798 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010799 line_pos++;
10800 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010801 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010802 if (ch == '\n' || ch == '\r')
10803 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010805 }
10806 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010807 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010808
Antoine Pitroue71d5742011-10-04 15:55:09 +020010809 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010810 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10811 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812}
10813
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010814PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010815 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816\n\
10817Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010818such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819arguments start and end are interpreted as in slice notation.\n\
10820\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010821Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822
10823static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010826 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010827 Py_ssize_t start;
10828 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010829 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830
Jesus Ceaac451502011-04-20 17:09:23 +020010831 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10832 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 if (PyUnicode_READY(self) == -1)
10836 return NULL;
10837 if (PyUnicode_READY(substring) == -1)
10838 return NULL;
10839
Victor Stinner7931d9a2011-11-04 00:22:48 +010010840 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
10842 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 if (result == -2)
10845 return NULL;
10846
Christian Heimes217cfd12007-12-02 14:31:20 +000010847 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848}
10849
10850static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010851unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010853 void *data;
10854 enum PyUnicode_Kind kind;
10855 Py_UCS4 ch;
10856 PyObject *res;
10857
10858 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10859 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010861 }
10862 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10863 PyErr_SetString(PyExc_IndexError, "string index out of range");
10864 return NULL;
10865 }
10866 kind = PyUnicode_KIND(self);
10867 data = PyUnicode_DATA(self);
10868 ch = PyUnicode_READ(kind, data, index);
10869 if (ch < 256)
10870 return get_latin1_char(ch);
10871
10872 res = PyUnicode_New(1, ch);
10873 if (res == NULL)
10874 return NULL;
10875 kind = PyUnicode_KIND(res);
10876 data = PyUnicode_DATA(res);
10877 PyUnicode_WRITE(kind, data, 0, ch);
10878 assert(_PyUnicode_CheckConsistency(res, 1));
10879 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880}
10881
Guido van Rossumc2504932007-09-18 19:42:40 +000010882/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010883 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010884static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010885unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886{
Guido van Rossumc2504932007-09-18 19:42:40 +000010887 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080010888 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000010889
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010890#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050010891 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010892#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 if (_PyUnicode_HASH(self) != -1)
10894 return _PyUnicode_HASH(self);
10895 if (PyUnicode_READY(self) == -1)
10896 return -1;
10897 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010010898 /*
10899 We make the hash of the empty string be 0, rather than using
10900 (prefix ^ suffix), since this slightly obfuscates the hash secret
10901 */
10902 if (len == 0) {
10903 _PyUnicode_HASH(self) = 0;
10904 return 0;
10905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906
10907 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010010908#define HASH(P) \
10909 x ^= (Py_uhash_t) *P << 7; \
10910 while (--len >= 0) \
10911 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912
Georg Brandl2fb477c2012-02-21 00:33:36 +010010913 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 switch (PyUnicode_KIND(self)) {
10915 case PyUnicode_1BYTE_KIND: {
10916 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10917 HASH(c);
10918 break;
10919 }
10920 case PyUnicode_2BYTE_KIND: {
10921 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10922 HASH(s);
10923 break;
10924 }
10925 default: {
10926 Py_UCS4 *l;
10927 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10928 "Impossible switch case in unicode_hash");
10929 l = PyUnicode_4BYTE_DATA(self);
10930 HASH(l);
10931 break;
10932 }
10933 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010010934 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
10935 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936
Guido van Rossumc2504932007-09-18 19:42:40 +000010937 if (x == -1)
10938 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010940 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010944PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010945 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010947Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948
10949static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010952 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010953 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010954 Py_ssize_t start;
10955 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956
Jesus Ceaac451502011-04-20 17:09:23 +020010957 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10958 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961 if (PyUnicode_READY(self) == -1)
10962 return NULL;
10963 if (PyUnicode_READY(substring) == -1)
10964 return NULL;
10965
Victor Stinner7931d9a2011-11-04 00:22:48 +010010966 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967
10968 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 if (result == -2)
10971 return NULL;
10972
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973 if (result < 0) {
10974 PyErr_SetString(PyExc_ValueError, "substring not found");
10975 return NULL;
10976 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010977
Christian Heimes217cfd12007-12-02 14:31:20 +000010978 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979}
10980
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010981PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010982 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010984Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010985at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986
10987static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010988unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990 Py_ssize_t i, length;
10991 int kind;
10992 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993 int cased;
10994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 if (PyUnicode_READY(self) == -1)
10996 return NULL;
10997 length = PyUnicode_GET_LENGTH(self);
10998 kind = PyUnicode_KIND(self);
10999 data = PyUnicode_DATA(self);
11000
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002 if (length == 1)
11003 return PyBool_FromLong(
11004 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011006 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011008 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011009
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 for (i = 0; i < length; i++) {
11012 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011013
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11015 return PyBool_FromLong(0);
11016 else if (!cased && Py_UNICODE_ISLOWER(ch))
11017 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011019 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020}
11021
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011022PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011025Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011026at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027
11028static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011029unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031 Py_ssize_t i, length;
11032 int kind;
11033 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034 int cased;
11035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 if (PyUnicode_READY(self) == -1)
11037 return NULL;
11038 length = PyUnicode_GET_LENGTH(self);
11039 kind = PyUnicode_KIND(self);
11040 data = PyUnicode_DATA(self);
11041
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011043 if (length == 1)
11044 return PyBool_FromLong(
11045 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011047 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011049 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011050
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 for (i = 0; i < length; i++) {
11053 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011054
Benjamin Peterson29060642009-01-31 22:14:21 +000011055 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11056 return PyBool_FromLong(0);
11057 else if (!cased && Py_UNICODE_ISUPPER(ch))
11058 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011060 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061}
11062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011063PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011064 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011066Return True if S is a titlecased string and there is at least one\n\
11067character in S, i.e. upper- and titlecase characters may only\n\
11068follow uncased characters and lowercase characters only cased ones.\n\
11069Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070
11071static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011072unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 Py_ssize_t i, length;
11075 int kind;
11076 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077 int cased, previous_is_cased;
11078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011079 if (PyUnicode_READY(self) == -1)
11080 return NULL;
11081 length = PyUnicode_GET_LENGTH(self);
11082 kind = PyUnicode_KIND(self);
11083 data = PyUnicode_DATA(self);
11084
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 if (length == 1) {
11087 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11088 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11089 (Py_UNICODE_ISUPPER(ch) != 0));
11090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011092 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011094 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011095
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096 cased = 0;
11097 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 for (i = 0; i < length; i++) {
11099 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011100
Benjamin Peterson29060642009-01-31 22:14:21 +000011101 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11102 if (previous_is_cased)
11103 return PyBool_FromLong(0);
11104 previous_is_cased = 1;
11105 cased = 1;
11106 }
11107 else if (Py_UNICODE_ISLOWER(ch)) {
11108 if (!previous_is_cased)
11109 return PyBool_FromLong(0);
11110 previous_is_cased = 1;
11111 cased = 1;
11112 }
11113 else
11114 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011116 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117}
11118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011119PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011120 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011122Return True if all characters in S are whitespace\n\
11123and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124
11125static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011126unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011128 Py_ssize_t i, length;
11129 int kind;
11130 void *data;
11131
11132 if (PyUnicode_READY(self) == -1)
11133 return NULL;
11134 length = PyUnicode_GET_LENGTH(self);
11135 kind = PyUnicode_KIND(self);
11136 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 if (length == 1)
11140 return PyBool_FromLong(
11141 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011143 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011145 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 for (i = 0; i < length; i++) {
11148 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011149 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011150 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011152 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153}
11154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011155PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011156 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011157\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011158Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011159and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011160
11161static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011162unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011163{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 Py_ssize_t i, length;
11165 int kind;
11166 void *data;
11167
11168 if (PyUnicode_READY(self) == -1)
11169 return NULL;
11170 length = PyUnicode_GET_LENGTH(self);
11171 kind = PyUnicode_KIND(self);
11172 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011173
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011174 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 if (length == 1)
11176 return PyBool_FromLong(
11177 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011178
11179 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011181 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 for (i = 0; i < length; i++) {
11184 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011186 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011187 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011188}
11189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011190PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011191 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011192\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011193Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011194and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011195
11196static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011197unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011198{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 int kind;
11200 void *data;
11201 Py_ssize_t len, i;
11202
11203 if (PyUnicode_READY(self) == -1)
11204 return NULL;
11205
11206 kind = PyUnicode_KIND(self);
11207 data = PyUnicode_DATA(self);
11208 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011209
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011210 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 if (len == 1) {
11212 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11213 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11214 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011215
11216 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011218 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 for (i = 0; i < len; i++) {
11221 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011222 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011223 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011224 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011225 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011226}
11227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011228PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011229 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011231Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011232False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233
11234static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011235unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 Py_ssize_t i, length;
11238 int kind;
11239 void *data;
11240
11241 if (PyUnicode_READY(self) == -1)
11242 return NULL;
11243 length = PyUnicode_GET_LENGTH(self);
11244 kind = PyUnicode_KIND(self);
11245 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248 if (length == 1)
11249 return PyBool_FromLong(
11250 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011252 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011254 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 for (i = 0; i < length; i++) {
11257 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011260 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261}
11262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011263PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011266Return True if all characters in S are digits\n\
11267and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268
11269static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011270unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272 Py_ssize_t i, length;
11273 int kind;
11274 void *data;
11275
11276 if (PyUnicode_READY(self) == -1)
11277 return NULL;
11278 length = PyUnicode_GET_LENGTH(self);
11279 kind = PyUnicode_KIND(self);
11280 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 if (length == 1) {
11284 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11285 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11286 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011288 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011290 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011292 for (i = 0; i < length; i++) {
11293 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011294 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011296 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297}
11298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011299PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011302Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011303False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304
11305static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011306unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 Py_ssize_t i, length;
11309 int kind;
11310 void *data;
11311
11312 if (PyUnicode_READY(self) == -1)
11313 return NULL;
11314 length = PyUnicode_GET_LENGTH(self);
11315 kind = PyUnicode_KIND(self);
11316 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 if (length == 1)
11320 return PyBool_FromLong(
11321 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011323 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011325 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 for (i = 0; i < length; i++) {
11328 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011329 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011331 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332}
11333
Martin v. Löwis47383402007-08-15 07:32:56 +000011334int
11335PyUnicode_IsIdentifier(PyObject *self)
11336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 int kind;
11338 void *data;
11339 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011340 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 if (PyUnicode_READY(self) == -1) {
11343 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 }
11346
11347 /* Special case for empty strings */
11348 if (PyUnicode_GET_LENGTH(self) == 0)
11349 return 0;
11350 kind = PyUnicode_KIND(self);
11351 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011352
11353 /* PEP 3131 says that the first character must be in
11354 XID_Start and subsequent characters in XID_Continue,
11355 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011356 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011357 letters, digits, underscore). However, given the current
11358 definition of XID_Start and XID_Continue, it is sufficient
11359 to check just for these, except that _ must be allowed
11360 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011362 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011363 return 0;
11364
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011365 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011366 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011367 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011368 return 1;
11369}
11370
11371PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011372 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011373\n\
11374Return True if S is a valid identifier according\n\
11375to the language definition.");
11376
11377static PyObject*
11378unicode_isidentifier(PyObject *self)
11379{
11380 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11381}
11382
Georg Brandl559e5d72008-06-11 18:37:52 +000011383PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011384 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011385\n\
11386Return True if all characters in S are considered\n\
11387printable in repr() or S is empty, False otherwise.");
11388
11389static PyObject*
11390unicode_isprintable(PyObject *self)
11391{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 Py_ssize_t i, length;
11393 int kind;
11394 void *data;
11395
11396 if (PyUnicode_READY(self) == -1)
11397 return NULL;
11398 length = PyUnicode_GET_LENGTH(self);
11399 kind = PyUnicode_KIND(self);
11400 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011401
11402 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 if (length == 1)
11404 return PyBool_FromLong(
11405 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 for (i = 0; i < length; i++) {
11408 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011409 Py_RETURN_FALSE;
11410 }
11411 }
11412 Py_RETURN_TRUE;
11413}
11414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011415PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011416 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417\n\
11418Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011419iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420
11421static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011422unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011424 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425}
11426
Martin v. Löwis18e16552006-02-15 17:27:45 +000011427static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011428unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 if (PyUnicode_READY(self) == -1)
11431 return -1;
11432 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433}
11434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011435PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011438Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011439done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440
11441static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011442unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011444 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 Py_UCS4 fillchar = ' ';
11446
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011447 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448 return NULL;
11449
Benjamin Petersonbac79492012-01-14 13:34:47 -050011450 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452
Victor Stinnerc4b49542011-12-11 22:44:26 +010011453 if (PyUnicode_GET_LENGTH(self) >= width)
11454 return unicode_result_unchanged(self);
11455
11456 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457}
11458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011459PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011460 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011462Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463
11464static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011465unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011467 if (PyUnicode_READY(self) == -1)
11468 return NULL;
11469 if (PyUnicode_IS_ASCII(self))
11470 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011471 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472}
11473
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for mode i: the format string with its 3-character "|O:"
   prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11482
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011483/* externally visible for str.strip(unicode) */
11484PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011485_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 void *data;
11488 int kind;
11489 Py_ssize_t i, j, len;
11490 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11493 return NULL;
11494
11495 kind = PyUnicode_KIND(self);
11496 data = PyUnicode_DATA(self);
11497 len = PyUnicode_GET_LENGTH(self);
11498 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11499 PyUnicode_DATA(sepobj),
11500 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011501
Benjamin Peterson14339b62009-01-31 16:36:08 +000011502 i = 0;
11503 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 while (i < len &&
11505 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 i++;
11507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011508 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011509
Benjamin Peterson14339b62009-01-31 16:36:08 +000011510 j = len;
11511 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 do {
11513 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514 } while (j >= i &&
11515 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011517 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011518
Victor Stinner7931d9a2011-11-04 00:22:48 +010011519 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520}
11521
11522PyObject*
11523PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11524{
11525 unsigned char *data;
11526 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011527 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528
Victor Stinnerde636f32011-10-01 03:55:54 +020011529 if (PyUnicode_READY(self) == -1)
11530 return NULL;
11531
Victor Stinner684d5fd2012-05-03 02:32:34 +020011532 length = PyUnicode_GET_LENGTH(self);
11533 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011534
Victor Stinner684d5fd2012-05-03 02:32:34 +020011535 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011536 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537
Victor Stinnerde636f32011-10-01 03:55:54 +020011538 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011539 PyErr_SetString(PyExc_IndexError, "string index out of range");
11540 return NULL;
11541 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011542 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011543 Py_INCREF(unicode_empty);
11544 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011545 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011546
Victor Stinner684d5fd2012-05-03 02:32:34 +020011547 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011548 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011549 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011550 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011551 }
11552 else {
11553 kind = PyUnicode_KIND(self);
11554 data = PyUnicode_1BYTE_DATA(self);
11555 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011556 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011557 length);
11558 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560
11561static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011562do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 int kind;
11565 void *data;
11566 Py_ssize_t len, i, j;
11567
11568 if (PyUnicode_READY(self) == -1)
11569 return NULL;
11570
11571 kind = PyUnicode_KIND(self);
11572 data = PyUnicode_DATA(self);
11573 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011574
Benjamin Peterson14339b62009-01-31 16:36:08 +000011575 i = 0;
11576 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011578 i++;
11579 }
11580 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011581
Benjamin Peterson14339b62009-01-31 16:36:08 +000011582 j = len;
11583 if (striptype != LEFTSTRIP) {
11584 do {
11585 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011587 j++;
11588 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011589
Victor Stinner7931d9a2011-11-04 00:22:48 +010011590 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591}
11592
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011593
11594static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011595do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011596{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011597 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011598
Benjamin Peterson14339b62009-01-31 16:36:08 +000011599 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11600 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011601
Benjamin Peterson14339b62009-01-31 16:36:08 +000011602 if (sep != NULL && sep != Py_None) {
11603 if (PyUnicode_Check(sep))
11604 return _PyUnicode_XStrip(self, striptype, sep);
11605 else {
11606 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011607 "%s arg must be None or str",
11608 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011609 return NULL;
11610 }
11611 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011612
Benjamin Peterson14339b62009-01-31 16:36:08 +000011613 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011614}
11615
11616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011617PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011618 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011619\n\
11620Return a copy of the string S with leading and trailing\n\
11621whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011622If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011623
11624static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011625unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011626{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011627 if (PyTuple_GET_SIZE(args) == 0)
11628 return do_strip(self, BOTHSTRIP); /* Common case */
11629 else
11630 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011631}
11632
11633
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011634PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011635 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011636\n\
11637Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011638If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011639
11640static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011641unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011642{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011643 if (PyTuple_GET_SIZE(args) == 0)
11644 return do_strip(self, LEFTSTRIP); /* Common case */
11645 else
11646 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011647}
11648
11649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011650PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011651 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011652\n\
11653Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011654If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011655
11656static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011657unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011658{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011659 if (PyTuple_GET_SIZE(args) == 0)
11660 return do_strip(self, RIGHTSTRIP); /* Common case */
11661 else
11662 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011663}
11664
11665
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011667unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011669 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671
Georg Brandl222de0f2009-04-12 12:01:50 +000011672 if (len < 1) {
11673 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011674 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011675 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676
Victor Stinnerc4b49542011-12-11 22:44:26 +010011677 /* no repeat, return original string */
11678 if (len == 1)
11679 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011680
Benjamin Petersonbac79492012-01-14 13:34:47 -050011681 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 return NULL;
11683
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011684 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011685 PyErr_SetString(PyExc_OverflowError,
11686 "repeated string is too long");
11687 return NULL;
11688 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011690
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011691 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692 if (!u)
11693 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011694 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 if (PyUnicode_GET_LENGTH(str) == 1) {
11697 const int kind = PyUnicode_KIND(str);
11698 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011699 if (kind == PyUnicode_1BYTE_KIND) {
11700 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011701 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011702 }
11703 else if (kind == PyUnicode_2BYTE_KIND) {
11704 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011705 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011706 ucs2[n] = fill_char;
11707 } else {
11708 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11709 assert(kind == PyUnicode_4BYTE_KIND);
11710 for (n = 0; n < len; ++n)
11711 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011712 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 }
11714 else {
11715 /* number of characters copied this far */
11716 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011717 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 char *to = (char *) PyUnicode_DATA(u);
11719 Py_MEMCPY(to, PyUnicode_DATA(str),
11720 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011721 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011722 n = (done <= nchars-done) ? done : nchars-done;
11723 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011724 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726 }
11727
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011728 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011729 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730}
11731
Alexander Belopolsky40018472011-02-26 01:02:56 +000011732PyObject *
11733PyUnicode_Replace(PyObject *obj,
11734 PyObject *subobj,
11735 PyObject *replobj,
11736 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737{
11738 PyObject *self;
11739 PyObject *str1;
11740 PyObject *str2;
11741 PyObject *result;
11742
11743 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011744 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011747 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011748 Py_DECREF(self);
11749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750 }
11751 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011752 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 Py_DECREF(self);
11754 Py_DECREF(str1);
11755 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011757 if (PyUnicode_READY(self) == -1 ||
11758 PyUnicode_READY(str1) == -1 ||
11759 PyUnicode_READY(str2) == -1)
11760 result = NULL;
11761 else
11762 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763 Py_DECREF(self);
11764 Py_DECREF(str1);
11765 Py_DECREF(str2);
11766 return result;
11767}
11768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011769PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011770 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771\n\
11772Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011773old replaced by new. If the optional argument count is\n\
11774given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775
11776static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 PyObject *str1;
11780 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011781 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782 PyObject *result;
11783
Martin v. Löwis18e16552006-02-15 17:27:45 +000011784 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011786 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011789 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 return NULL;
11791 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011792 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011793 Py_DECREF(str1);
11794 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011795 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011796 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11797 result = NULL;
11798 else
11799 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800
11801 Py_DECREF(str1);
11802 Py_DECREF(str2);
11803 return result;
11804}
11805
Alexander Belopolsky40018472011-02-26 01:02:56 +000011806static PyObject *
11807unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011809 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 Py_ssize_t isize;
11811 Py_ssize_t osize, squote, dquote, i, o;
11812 Py_UCS4 max, quote;
11813 int ikind, okind;
11814 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011817 return NULL;
11818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 isize = PyUnicode_GET_LENGTH(unicode);
11820 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 /* Compute length of output, quote characters, and
11823 maximum character */
11824 osize = 2; /* quotes */
11825 max = 127;
11826 squote = dquote = 0;
11827 ikind = PyUnicode_KIND(unicode);
11828 for (i = 0; i < isize; i++) {
11829 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11830 switch (ch) {
11831 case '\'': squote++; osize++; break;
11832 case '"': dquote++; osize++; break;
11833 case '\\': case '\t': case '\r': case '\n':
11834 osize += 2; break;
11835 default:
11836 /* Fast-path ASCII */
11837 if (ch < ' ' || ch == 0x7f)
11838 osize += 4; /* \xHH */
11839 else if (ch < 0x7f)
11840 osize++;
11841 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11842 osize++;
11843 max = ch > max ? ch : max;
11844 }
11845 else if (ch < 0x100)
11846 osize += 4; /* \xHH */
11847 else if (ch < 0x10000)
11848 osize += 6; /* \uHHHH */
11849 else
11850 osize += 10; /* \uHHHHHHHH */
11851 }
11852 }
11853
11854 quote = '\'';
11855 if (squote) {
11856 if (dquote)
11857 /* Both squote and dquote present. Use squote,
11858 and escape them */
11859 osize += squote;
11860 else
11861 quote = '"';
11862 }
11863
11864 repr = PyUnicode_New(osize, max);
11865 if (repr == NULL)
11866 return NULL;
11867 okind = PyUnicode_KIND(repr);
11868 odata = PyUnicode_DATA(repr);
11869
11870 PyUnicode_WRITE(okind, odata, 0, quote);
11871 PyUnicode_WRITE(okind, odata, osize-1, quote);
11872
11873 for (i = 0, o = 1; i < isize; i++) {
11874 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011875
11876 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 if ((ch == quote) || (ch == '\\')) {
11878 PyUnicode_WRITE(okind, odata, o++, '\\');
11879 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011880 continue;
11881 }
11882
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011884 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 PyUnicode_WRITE(okind, odata, o++, '\\');
11886 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011887 }
11888 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 PyUnicode_WRITE(okind, odata, o++, '\\');
11890 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011891 }
11892 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 PyUnicode_WRITE(okind, odata, o++, '\\');
11894 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011895 }
11896
11897 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011898 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 PyUnicode_WRITE(okind, odata, o++, '\\');
11900 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011901 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11902 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011903 }
11904
Georg Brandl559e5d72008-06-11 18:37:52 +000011905 /* Copy ASCII characters as-is */
11906 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011908 }
11909
Benjamin Peterson29060642009-01-31 22:14:21 +000011910 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011911 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011912 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011913 (categories Z* and C* except ASCII space)
11914 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011916 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000011917 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011920 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11921 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011922 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011923 /* Map 16-bit characters to '\uxxxx' */
11924 else if (ch <= 0xffff) {
11925 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011926 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11927 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11928 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11929 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011930 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011931 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011932 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011933 PyUnicode_WRITE(okind, odata, o++, 'U');
11934 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11935 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11936 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11937 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020011938 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11939 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11940 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11941 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011942 }
11943 }
11944 /* Copy characters as-is */
11945 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011947 }
11948 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011951 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011952 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953}
11954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011955PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957\n\
11958Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011959such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960arguments start and end are interpreted as in slice notation.\n\
11961\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011962Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963
11964static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011967 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011968 Py_ssize_t start;
11969 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011970 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971
Jesus Ceaac451502011-04-20 17:09:23 +020011972 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11973 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011974 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 if (PyUnicode_READY(self) == -1)
11977 return NULL;
11978 if (PyUnicode_READY(substring) == -1)
11979 return NULL;
11980
Victor Stinner7931d9a2011-11-04 00:22:48 +010011981 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982
11983 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 if (result == -2)
11986 return NULL;
11987
Christian Heimes217cfd12007-12-02 14:31:20 +000011988 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989}
11990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011991PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011994Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995
11996static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011999 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012000 Py_ssize_t start;
12001 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012002 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003
Jesus Ceaac451502011-04-20 17:09:23 +020012004 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12005 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012006 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (PyUnicode_READY(self) == -1)
12009 return NULL;
12010 if (PyUnicode_READY(substring) == -1)
12011 return NULL;
12012
Victor Stinner7931d9a2011-11-04 00:22:48 +010012013 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014
12015 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 if (result == -2)
12018 return NULL;
12019
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020 if (result < 0) {
12021 PyErr_SetString(PyExc_ValueError, "substring not found");
12022 return NULL;
12023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024
Christian Heimes217cfd12007-12-02 14:31:20 +000012025 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026}
12027
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012028PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012029 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012031Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012032done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033
12034static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012035unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012037 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 Py_UCS4 fillchar = ' ';
12039
Victor Stinnere9a29352011-10-01 02:14:59 +020012040 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012042
Benjamin Petersonbac79492012-01-14 13:34:47 -050012043 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044 return NULL;
12045
Victor Stinnerc4b49542011-12-11 22:44:26 +010012046 if (PyUnicode_GET_LENGTH(self) >= width)
12047 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048
Victor Stinnerc4b49542011-12-11 22:44:26 +010012049 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050}
12051
Alexander Belopolsky40018472011-02-26 01:02:56 +000012052PyObject *
12053PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054{
12055 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012056
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057 s = PyUnicode_FromObject(s);
12058 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012059 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012060 if (sep != NULL) {
12061 sep = PyUnicode_FromObject(sep);
12062 if (sep == NULL) {
12063 Py_DECREF(s);
12064 return NULL;
12065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066 }
12067
Victor Stinner9310abb2011-10-05 00:59:23 +020012068 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069
12070 Py_DECREF(s);
12071 Py_XDECREF(sep);
12072 return result;
12073}
12074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012075PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012076 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077\n\
12078Return a list of the words in S, using sep as the\n\
12079delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012080splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012081whitespace string is a separator and empty strings are\n\
12082removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083
12084static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012085unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012087 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012089 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012091 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12092 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093 return NULL;
12094
12095 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012096 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012098 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012100 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101}
12102
Thomas Wouters477c8d52006-05-27 19:21:47 +000012103PyObject *
12104PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12105{
12106 PyObject* str_obj;
12107 PyObject* sep_obj;
12108 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 int kind1, kind2, kind;
12110 void *buf1 = NULL, *buf2 = NULL;
12111 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012112
12113 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012114 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012115 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012116 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012117 if (!sep_obj) {
12118 Py_DECREF(str_obj);
12119 return NULL;
12120 }
12121 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12122 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012123 Py_DECREF(str_obj);
12124 return NULL;
12125 }
12126
Victor Stinner14f8f022011-10-05 20:58:25 +020012127 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012129 kind = Py_MAX(kind1, kind2);
12130 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012132 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 if (!buf1)
12134 goto onError;
12135 buf2 = PyUnicode_DATA(sep_obj);
12136 if (kind2 != kind)
12137 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12138 if (!buf2)
12139 goto onError;
12140 len1 = PyUnicode_GET_LENGTH(str_obj);
12141 len2 = PyUnicode_GET_LENGTH(sep_obj);
12142
Benjamin Petersonead6b532011-12-20 17:23:42 -060012143 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012145 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12146 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12147 else
12148 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 break;
12150 case PyUnicode_2BYTE_KIND:
12151 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12152 break;
12153 case PyUnicode_4BYTE_KIND:
12154 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12155 break;
12156 default:
12157 assert(0);
12158 out = 0;
12159 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012160
12161 Py_DECREF(sep_obj);
12162 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 if (kind1 != kind)
12164 PyMem_Free(buf1);
12165 if (kind2 != kind)
12166 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012167
12168 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012169 onError:
12170 Py_DECREF(sep_obj);
12171 Py_DECREF(str_obj);
12172 if (kind1 != kind && buf1)
12173 PyMem_Free(buf1);
12174 if (kind2 != kind && buf2)
12175 PyMem_Free(buf2);
12176 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012177}
12178
12179
12180PyObject *
12181PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12182{
12183 PyObject* str_obj;
12184 PyObject* sep_obj;
12185 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 int kind1, kind2, kind;
12187 void *buf1 = NULL, *buf2 = NULL;
12188 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012189
12190 str_obj = PyUnicode_FromObject(str_in);
12191 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012192 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012193 sep_obj = PyUnicode_FromObject(sep_in);
12194 if (!sep_obj) {
12195 Py_DECREF(str_obj);
12196 return NULL;
12197 }
12198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 kind1 = PyUnicode_KIND(str_in);
12200 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012201 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 buf1 = PyUnicode_DATA(str_in);
12203 if (kind1 != kind)
12204 buf1 = _PyUnicode_AsKind(str_in, kind);
12205 if (!buf1)
12206 goto onError;
12207 buf2 = PyUnicode_DATA(sep_obj);
12208 if (kind2 != kind)
12209 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12210 if (!buf2)
12211 goto onError;
12212 len1 = PyUnicode_GET_LENGTH(str_obj);
12213 len2 = PyUnicode_GET_LENGTH(sep_obj);
12214
Benjamin Petersonead6b532011-12-20 17:23:42 -060012215 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012217 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12218 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12219 else
12220 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 break;
12222 case PyUnicode_2BYTE_KIND:
12223 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12224 break;
12225 case PyUnicode_4BYTE_KIND:
12226 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12227 break;
12228 default:
12229 assert(0);
12230 out = 0;
12231 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012232
12233 Py_DECREF(sep_obj);
12234 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235 if (kind1 != kind)
12236 PyMem_Free(buf1);
12237 if (kind2 != kind)
12238 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012239
12240 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012241 onError:
12242 Py_DECREF(sep_obj);
12243 Py_DECREF(str_obj);
12244 if (kind1 != kind && buf1)
12245 PyMem_Free(buf1);
12246 if (kind2 != kind && buf2)
12247 PyMem_Free(buf2);
12248 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012249}
12250
12251PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012253\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012254Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012255the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012256found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012257
12258static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012259unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012260{
Victor Stinner9310abb2011-10-05 00:59:23 +020012261 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012262}
12263
12264PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012265 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012266\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012267Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012268the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012269separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012270
12271static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012272unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012273{
Victor Stinner9310abb2011-10-05 00:59:23 +020012274 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012275}
12276
Alexander Belopolsky40018472011-02-26 01:02:56 +000012277PyObject *
12278PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012279{
12280 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012281
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012282 s = PyUnicode_FromObject(s);
12283 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012284 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 if (sep != NULL) {
12286 sep = PyUnicode_FromObject(sep);
12287 if (sep == NULL) {
12288 Py_DECREF(s);
12289 return NULL;
12290 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012291 }
12292
Victor Stinner9310abb2011-10-05 00:59:23 +020012293 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012294
12295 Py_DECREF(s);
12296 Py_XDECREF(sep);
12297 return result;
12298}
12299
12300PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012301 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012302\n\
12303Return a list of the words in S, using sep as the\n\
12304delimiter string, starting at the end of the string and\n\
12305working to the front. If maxsplit is given, at most maxsplit\n\
12306splits are done. If sep is not specified, any whitespace string\n\
12307is a separator.");
12308
12309static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012310unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012311{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012312 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012313 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012314 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012315
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012316 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12317 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012318 return NULL;
12319
12320 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012321 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012322 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012323 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012324 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012325 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012326}
12327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012328PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012329 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012330\n\
12331Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012332Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012333is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334
12335static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012336unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012337{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012338 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012339 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012341 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12342 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343 return NULL;
12344
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012345 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346}
12347
12348static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012349PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012351 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352}
12353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012354PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012355 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356\n\
12357Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012358and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359
12360static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012361unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012363 if (PyUnicode_READY(self) == -1)
12364 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012365 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366}
12367
Georg Brandlceee0772007-11-27 23:48:05 +000012368PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012369 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012370\n\
12371Return a translation table usable for str.translate().\n\
12372If there is only one argument, it must be a dictionary mapping Unicode\n\
12373ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012374Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012375If there are two arguments, they must be strings of equal length, and\n\
12376in the resulting dictionary, each character in x will be mapped to the\n\
12377character at the same position in y. If there is a third argument, it\n\
12378must be a string, whose characters will be mapped to None in the result.");
12379
12380static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012381unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012382{
12383 PyObject *x, *y = NULL, *z = NULL;
12384 PyObject *new = NULL, *key, *value;
12385 Py_ssize_t i = 0;
12386 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012387
Georg Brandlceee0772007-11-27 23:48:05 +000012388 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12389 return NULL;
12390 new = PyDict_New();
12391 if (!new)
12392 return NULL;
12393 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394 int x_kind, y_kind, z_kind;
12395 void *x_data, *y_data, *z_data;
12396
Georg Brandlceee0772007-11-27 23:48:05 +000012397 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012398 if (!PyUnicode_Check(x)) {
12399 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12400 "be a string if there is a second argument");
12401 goto err;
12402 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012404 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12405 "arguments must have equal length");
12406 goto err;
12407 }
12408 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 x_kind = PyUnicode_KIND(x);
12410 y_kind = PyUnicode_KIND(y);
12411 x_data = PyUnicode_DATA(x);
12412 y_data = PyUnicode_DATA(y);
12413 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12414 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012415 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012416 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012417 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012418 if (!value) {
12419 Py_DECREF(key);
12420 goto err;
12421 }
Georg Brandlceee0772007-11-27 23:48:05 +000012422 res = PyDict_SetItem(new, key, value);
12423 Py_DECREF(key);
12424 Py_DECREF(value);
12425 if (res < 0)
12426 goto err;
12427 }
12428 /* create entries for deleting chars in z */
12429 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 z_kind = PyUnicode_KIND(z);
12431 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012432 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012434 if (!key)
12435 goto err;
12436 res = PyDict_SetItem(new, key, Py_None);
12437 Py_DECREF(key);
12438 if (res < 0)
12439 goto err;
12440 }
12441 }
12442 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 int kind;
12444 void *data;
12445
Georg Brandlceee0772007-11-27 23:48:05 +000012446 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012447 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012448 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12449 "to maketrans it must be a dict");
12450 goto err;
12451 }
12452 /* copy entries into the new dict, converting string keys to int keys */
12453 while (PyDict_Next(x, &i, &key, &value)) {
12454 if (PyUnicode_Check(key)) {
12455 /* convert string keys to integer keys */
12456 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012457 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012458 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12459 "table must be of length 1");
12460 goto err;
12461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 kind = PyUnicode_KIND(key);
12463 data = PyUnicode_DATA(key);
12464 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012465 if (!newkey)
12466 goto err;
12467 res = PyDict_SetItem(new, newkey, value);
12468 Py_DECREF(newkey);
12469 if (res < 0)
12470 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012471 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012472 /* just keep integer keys */
12473 if (PyDict_SetItem(new, key, value) < 0)
12474 goto err;
12475 } else {
12476 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12477 "be strings or integers");
12478 goto err;
12479 }
12480 }
12481 }
12482 return new;
12483 err:
12484 Py_DECREF(new);
12485 return NULL;
12486}
12487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012488PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012489 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490\n\
12491Return a copy of the string S, where all characters have been mapped\n\
12492through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012493Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012494Unmapped characters are left untouched. Characters mapped to None\n\
12495are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496
12497static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501}
12502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012503PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012504 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012506Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507
12508static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012509unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012511 if (PyUnicode_READY(self) == -1)
12512 return NULL;
12513 if (PyUnicode_IS_ASCII(self))
12514 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012515 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516}
12517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012518PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012519 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012521Pad a numeric string S with zeros on the left, to fill a field\n\
12522of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523
12524static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012525unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012527 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012528 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012529 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012530 int kind;
12531 void *data;
12532 Py_UCS4 chr;
12533
Martin v. Löwis18e16552006-02-15 17:27:45 +000012534 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535 return NULL;
12536
Benjamin Petersonbac79492012-01-14 13:34:47 -050012537 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012538 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539
Victor Stinnerc4b49542011-12-11 22:44:26 +010012540 if (PyUnicode_GET_LENGTH(self) >= width)
12541 return unicode_result_unchanged(self);
12542
12543 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544
12545 u = pad(self, fill, 0, '0');
12546
Walter Dörwald068325e2002-04-15 13:36:47 +000012547 if (u == NULL)
12548 return NULL;
12549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 kind = PyUnicode_KIND(u);
12551 data = PyUnicode_DATA(u);
12552 chr = PyUnicode_READ(kind, data, fill);
12553
12554 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 PyUnicode_WRITE(kind, data, 0, chr);
12557 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558 }
12559
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012560 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012561 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563
#if 0
/* Compiled-out wrapper around PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012572PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012573 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012575Return True if S starts with the specified prefix, False otherwise.\n\
12576With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012577With optional end, stop comparing S at that position.\n\
12578prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579
12580static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012581unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012582 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012584 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012585 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012586 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012587 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012588 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589
Jesus Ceaac451502011-04-20 17:09:23 +020012590 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012592 if (PyTuple_Check(subobj)) {
12593 Py_ssize_t i;
12594 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012595 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012596 if (substring == NULL)
12597 return NULL;
12598 result = tailmatch(self, substring, start, end, -1);
12599 Py_DECREF(substring);
12600 if (result) {
12601 Py_RETURN_TRUE;
12602 }
12603 }
12604 /* nothing matched */
12605 Py_RETURN_FALSE;
12606 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012607 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012608 if (substring == NULL) {
12609 if (PyErr_ExceptionMatches(PyExc_TypeError))
12610 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12611 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012613 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012614 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012616 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617}
12618
12619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012620PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012621 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012623Return True if S ends with the specified suffix, False otherwise.\n\
12624With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012625With optional end, stop comparing S at that position.\n\
12626suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627
12628static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012629unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012630 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012632 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012633 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012634 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012635 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012636 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637
Jesus Ceaac451502011-04-20 17:09:23 +020012638 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012640 if (PyTuple_Check(subobj)) {
12641 Py_ssize_t i;
12642 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012643 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012644 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012645 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012646 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012647 result = tailmatch(self, substring, start, end, +1);
12648 Py_DECREF(substring);
12649 if (result) {
12650 Py_RETURN_TRUE;
12651 }
12652 }
12653 Py_RETURN_FALSE;
12654 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012655 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012656 if (substring == NULL) {
12657 if (PyErr_ExceptionMatches(PyExc_TypeError))
12658 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12659 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012661 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012662 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012664 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665}
12666
Victor Stinner202fdca2012-05-07 12:47:02 +020012667Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012668_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012669{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012670 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012671 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12672 writer->data = PyUnicode_DATA(writer->buffer);
12673 writer->kind = PyUnicode_KIND(writer->buffer);
12674}
12675
Victor Stinnerd3f08822012-05-29 12:57:52 +020012676void
12677_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012678{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012679 memset(writer, 0, sizeof(*writer));
12680#ifdef Py_DEBUG
12681 writer->kind = 5; /* invalid kind */
12682#endif
12683 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012684 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012685}
12686
Victor Stinnerd3f08822012-05-29 12:57:52 +020012687int
12688_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12689 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012690{
12691 Py_ssize_t newlen;
12692 PyObject *newbuffer;
12693
Victor Stinnerd3f08822012-05-29 12:57:52 +020012694 assert(length > 0);
12695
Victor Stinner202fdca2012-05-07 12:47:02 +020012696 if (length > PY_SSIZE_T_MAX - writer->pos) {
12697 PyErr_NoMemory();
12698 return -1;
12699 }
12700 newlen = writer->pos + length;
12701
Victor Stinnerd3f08822012-05-29 12:57:52 +020012702 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012703 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012704 /* overallocate 25% to limit the number of resize */
12705 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12706 newlen += newlen / 4;
12707 if (newlen < writer->min_length)
12708 newlen = writer->min_length;
12709 }
12710 writer->buffer = PyUnicode_New(newlen, maxchar);
12711 if (writer->buffer == NULL)
12712 return -1;
12713 _PyUnicodeWriter_Update(writer);
12714 return 0;
12715 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012716
Victor Stinnerd3f08822012-05-29 12:57:52 +020012717 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012718 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012719 /* overallocate 25% to limit the number of resize */
12720 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12721 newlen += newlen / 4;
12722 if (newlen < writer->min_length)
12723 newlen = writer->min_length;
12724 }
12725
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012726 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012727 /* resize + widen */
12728 newbuffer = PyUnicode_New(newlen, maxchar);
12729 if (newbuffer == NULL)
12730 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012731 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12732 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012733 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012734 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012735 }
12736 else {
12737 newbuffer = resize_compact(writer->buffer, newlen);
12738 if (newbuffer == NULL)
12739 return -1;
12740 }
12741 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012742 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012743 }
12744 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012745 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012746 newbuffer = PyUnicode_New(writer->size, maxchar);
12747 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012748 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012749 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12750 writer->buffer, 0, writer->pos);
12751 Py_DECREF(writer->buffer);
12752 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012753 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012754 }
12755 return 0;
12756}
12757
Victor Stinnerd3f08822012-05-29 12:57:52 +020012758int
12759_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12760{
12761 Py_UCS4 maxchar;
12762 Py_ssize_t len;
12763
12764 if (PyUnicode_READY(str) == -1)
12765 return -1;
12766 len = PyUnicode_GET_LENGTH(str);
12767 if (len == 0)
12768 return 0;
12769 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12770 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012771 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012772 Py_INCREF(str);
12773 writer->buffer = str;
12774 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012775 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012776 writer->size = 0;
12777 writer->pos += len;
12778 return 0;
12779 }
12780 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12781 return -1;
12782 }
12783 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12784 str, 0, len);
12785 writer->pos += len;
12786 return 0;
12787}
12788
Victor Stinnere215d962012-10-06 23:03:36 +020012789int
12790_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12791{
12792 Py_UCS4 maxchar;
12793
12794 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12795 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12796 return -1;
12797 unicode_write_cstr(writer->buffer, writer->pos, str, len);
12798 writer->pos += len;
12799 return 0;
12800}
12801
Victor Stinnerd3f08822012-05-29 12:57:52 +020012802PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012803_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012804{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012805 if (writer->pos == 0) {
12806 Py_XDECREF(writer->buffer);
12807 Py_INCREF(unicode_empty);
12808 return unicode_empty;
12809 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012810 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012811 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12812 return writer->buffer;
12813 }
12814 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12815 PyObject *newbuffer;
12816 newbuffer = resize_compact(writer->buffer, writer->pos);
12817 if (newbuffer == NULL) {
12818 Py_DECREF(writer->buffer);
12819 return NULL;
12820 }
12821 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012822 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012823 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012824 return writer->buffer;
12825}
12826
Victor Stinnerd3f08822012-05-29 12:57:52 +020012827void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012828_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012829{
12830 Py_CLEAR(writer->buffer);
12831}
12832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012833#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012834
12835PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012836 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012837\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012838Return a formatted version of S, using substitutions from args and kwargs.\n\
12839The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012840
Eric Smith27bbca62010-11-04 17:06:58 +000012841PyDoc_STRVAR(format_map__doc__,
12842 "S.format_map(mapping) -> str\n\
12843\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012844Return a formatted version of S, using substitutions from mapping.\n\
12845The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012846
Eric Smith4a7d76d2008-05-30 18:10:19 +000012847static PyObject *
12848unicode__format__(PyObject* self, PyObject* args)
12849{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012850 PyObject *format_spec;
12851 _PyUnicodeWriter writer;
12852 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012853
12854 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12855 return NULL;
12856
Victor Stinnerd3f08822012-05-29 12:57:52 +020012857 if (PyUnicode_READY(self) == -1)
12858 return NULL;
12859 _PyUnicodeWriter_Init(&writer, 0);
12860 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12861 self, format_spec, 0,
12862 PyUnicode_GET_LENGTH(format_spec));
12863 if (ret == -1) {
12864 _PyUnicodeWriter_Dealloc(&writer);
12865 return NULL;
12866 }
12867 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012868}
12869
Eric Smith8c663262007-08-25 02:26:07 +000012870PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012871 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012872\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012873Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012874
12875static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012876unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012878 Py_ssize_t size;
12879
12880 /* If it's a compact object, account for base structure +
12881 character data. */
12882 if (PyUnicode_IS_COMPACT_ASCII(v))
12883 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12884 else if (PyUnicode_IS_COMPACT(v))
12885 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012886 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012887 else {
12888 /* If it is a two-block object, account for base object, and
12889 for character block if present. */
12890 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012891 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012893 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012894 }
12895 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012896 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012897 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012899 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012900 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012901
12902 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012903}
12904
12905PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012906 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012907
12908static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012909unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012910{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012911 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012912 if (!copy)
12913 return NULL;
12914 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012915}
12916
Guido van Rossumd57fd912000-03-10 22:53:23 +000012917static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012918 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012919 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012920 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
12921 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012922 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12923 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050012924 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012925 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12926 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12927 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12928 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12929 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012930 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012931 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12932 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12933 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012934 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012935 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12936 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12937 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012938 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012939 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012940 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012941 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012942 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12943 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12944 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12945 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12946 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12947 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12948 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12949 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12950 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12951 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12952 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12953 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12954 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12955 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012956 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012957 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012958 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012959 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012960 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012961 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012962 {"maketrans", (PyCFunction) unicode_maketrans,
12963 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012964 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012965#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012966 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012967 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012968#endif
12969
Benjamin Peterson14339b62009-01-31 16:36:08 +000012970 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971 {NULL, NULL}
12972};
12973
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012974static PyObject *
12975unicode_mod(PyObject *v, PyObject *w)
12976{
Brian Curtindfc80e32011-08-10 20:28:54 -050012977 if (!PyUnicode_Check(v))
12978 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012979 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012980}
12981
12982static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012983 0, /*nb_add*/
12984 0, /*nb_subtract*/
12985 0, /*nb_multiply*/
12986 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012987};
12988
Guido van Rossumd57fd912000-03-10 22:53:23 +000012989static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012990 (lenfunc) unicode_length, /* sq_length */
12991 PyUnicode_Concat, /* sq_concat */
12992 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12993 (ssizeargfunc) unicode_getitem, /* sq_item */
12994 0, /* sq_slice */
12995 0, /* sq_ass_item */
12996 0, /* sq_ass_slice */
12997 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012998};
12999
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013000static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013001unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013002{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003 if (PyUnicode_READY(self) == -1)
13004 return NULL;
13005
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013006 if (PyIndex_Check(item)) {
13007 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013008 if (i == -1 && PyErr_Occurred())
13009 return NULL;
13010 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013012 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013013 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013014 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013015 PyObject *result;
13016 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013017 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013018 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013020 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013021 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013022 return NULL;
13023 }
13024
13025 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013026 Py_INCREF(unicode_empty);
13027 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013028 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013029 slicelength == PyUnicode_GET_LENGTH(self)) {
13030 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013031 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013032 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013033 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013034 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013035 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013036 src_kind = PyUnicode_KIND(self);
13037 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013038 if (!PyUnicode_IS_ASCII(self)) {
13039 kind_limit = kind_maxchar_limit(src_kind);
13040 max_char = 0;
13041 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13042 ch = PyUnicode_READ(src_kind, src_data, cur);
13043 if (ch > max_char) {
13044 max_char = ch;
13045 if (max_char >= kind_limit)
13046 break;
13047 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013048 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013049 }
Victor Stinner55c99112011-10-13 01:17:06 +020013050 else
13051 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013052 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013053 if (result == NULL)
13054 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013055 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013056 dest_data = PyUnicode_DATA(result);
13057
13058 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013059 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13060 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013061 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013062 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013063 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013064 } else {
13065 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13066 return NULL;
13067 }
13068}
13069
13070static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013071 (lenfunc)unicode_length, /* mp_length */
13072 (binaryfunc)unicode_subscript, /* mp_subscript */
13073 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013074};
13075
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077/* Helpers for PyUnicode_Format() */
13078
Victor Stinnera47082312012-10-04 02:19:54 +020013079struct unicode_formatter_t {
13080 PyObject *args;
13081 int args_owned;
13082 Py_ssize_t arglen, argidx;
13083 PyObject *dict;
13084
13085 enum PyUnicode_Kind fmtkind;
13086 Py_ssize_t fmtcnt, fmtpos;
13087 void *fmtdata;
13088 PyObject *fmtstr;
13089
13090 _PyUnicodeWriter writer;
13091};
13092
13093struct unicode_format_arg_t {
13094 Py_UCS4 ch;
13095 int flags;
13096 Py_ssize_t width;
13097 int prec;
13098 int sign;
13099};
13100
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013102unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103{
Victor Stinnera47082312012-10-04 02:19:54 +020013104 Py_ssize_t argidx = ctx->argidx;
13105
13106 if (argidx < ctx->arglen) {
13107 ctx->argidx++;
13108 if (ctx->arglen < 0)
13109 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013110 else
Victor Stinnera47082312012-10-04 02:19:54 +020013111 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112 }
13113 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013114 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115 return NULL;
13116}
13117
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013118/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119
Victor Stinnera47082312012-10-04 02:19:54 +020013120/* Format a float into the writer if the writer is not NULL, or into *p_output
13121 otherwise.
13122
13123 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013124static int
Victor Stinnera47082312012-10-04 02:19:54 +020013125formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13126 PyObject **p_output,
13127 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013129 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013131 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013132 int prec;
13133 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013134
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135 x = PyFloat_AsDouble(v);
13136 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013137 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013138
Victor Stinnera47082312012-10-04 02:19:54 +020013139 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013142
Victor Stinnera47082312012-10-04 02:19:54 +020013143 if (arg->flags & F_ALT)
13144 dtoa_flags = Py_DTSF_ALT;
13145 else
13146 dtoa_flags = 0;
13147 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013148 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013149 return -1;
13150 len = strlen(p);
13151 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013152 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13153 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013154 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013155 }
Victor Stinner184252a2012-06-16 02:57:41 +020013156 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013157 writer->pos += len;
13158 }
13159 else
13160 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013161 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013162 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013163}
13164
Victor Stinnerd0880d52012-04-27 23:40:13 +020013165/* formatlong() emulates the format codes d, u, o, x and X, and
13166 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13167 * Python's regular ints.
13168 * Return value: a new PyUnicodeObject*, or NULL if error.
13169 * The output string is of the form
13170 * "-"? ("0x" | "0X")? digit+
13171 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13172 * set in flags. The case of hex digits will be correct,
13173 * There will be at least prec digits, zero-filled on the left if
13174 * necessary to get that many.
13175 * val object to be converted
13176 * flags bitmask of format flags; only F_ALT is looked at
13177 * prec minimum number of digits; 0-fill on left if needed
13178 * type a character in [duoxX]; u acts the same as d
13179 *
13180 * CAUTION: o, x and X conversions on regular ints can never
13181 * produce a '-' sign, but can for Python's unbounded ints.
13182 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013183static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013184formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013185{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013186 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013187 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013188 Py_ssize_t i;
13189 int sign; /* 1 if '-', else 0 */
13190 int len; /* number of characters */
13191 Py_ssize_t llen;
13192 int numdigits; /* len == numnondigits + numdigits */
13193 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013194 int prec = arg->prec;
13195 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013196
Victor Stinnerd0880d52012-04-27 23:40:13 +020013197 /* Avoid exceeding SSIZE_T_MAX */
13198 if (prec > INT_MAX-3) {
13199 PyErr_SetString(PyExc_OverflowError,
13200 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013201 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013202 }
13203
13204 assert(PyLong_Check(val));
13205
13206 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013207 default:
13208 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013209 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013210 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013211 case 'u':
13212 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013213 if (PyBool_Check(val))
13214 result = PyNumber_ToBase(val, 10);
13215 else
13216 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013217 break;
13218 case 'o':
13219 numnondigits = 2;
13220 result = PyNumber_ToBase(val, 8);
13221 break;
13222 case 'x':
13223 case 'X':
13224 numnondigits = 2;
13225 result = PyNumber_ToBase(val, 16);
13226 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013227 }
13228 if (!result)
13229 return NULL;
13230
13231 assert(unicode_modifiable(result));
13232 assert(PyUnicode_IS_READY(result));
13233 assert(PyUnicode_IS_ASCII(result));
13234
13235 /* To modify the string in-place, there can only be one reference. */
13236 if (Py_REFCNT(result) != 1) {
13237 PyErr_BadInternalCall();
13238 return NULL;
13239 }
13240 buf = PyUnicode_DATA(result);
13241 llen = PyUnicode_GET_LENGTH(result);
13242 if (llen > INT_MAX) {
13243 PyErr_SetString(PyExc_ValueError,
13244 "string too large in _PyBytes_FormatLong");
13245 return NULL;
13246 }
13247 len = (int)llen;
13248 sign = buf[0] == '-';
13249 numnondigits += sign;
13250 numdigits = len - numnondigits;
13251 assert(numdigits > 0);
13252
13253 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013254 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013255 (type == 'o' || type == 'x' || type == 'X'))) {
13256 assert(buf[sign] == '0');
13257 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13258 buf[sign+1] == 'o');
13259 numnondigits -= 2;
13260 buf += 2;
13261 len -= 2;
13262 if (sign)
13263 buf[0] = '-';
13264 assert(len == numnondigits + numdigits);
13265 assert(numdigits > 0);
13266 }
13267
13268 /* Fill with leading zeroes to meet minimum width. */
13269 if (prec > numdigits) {
13270 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13271 numnondigits + prec);
13272 char *b1;
13273 if (!r1) {
13274 Py_DECREF(result);
13275 return NULL;
13276 }
13277 b1 = PyBytes_AS_STRING(r1);
13278 for (i = 0; i < numnondigits; ++i)
13279 *b1++ = *buf++;
13280 for (i = 0; i < prec - numdigits; i++)
13281 *b1++ = '0';
13282 for (i = 0; i < numdigits; i++)
13283 *b1++ = *buf++;
13284 *b1 = '\0';
13285 Py_DECREF(result);
13286 result = r1;
13287 buf = PyBytes_AS_STRING(result);
13288 len = numnondigits + prec;
13289 }
13290
13291 /* Fix up case for hex conversions. */
13292 if (type == 'X') {
13293 /* Need to convert all lower case letters to upper case.
13294 and need to convert 0x to 0X (and -0x to -0X). */
13295 for (i = 0; i < len; i++)
13296 if (buf[i] >= 'a' && buf[i] <= 'x')
13297 buf[i] -= 'a'-'A';
13298 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013299 if (!PyUnicode_Check(result)
13300 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013301 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013302 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013303 Py_DECREF(result);
13304 result = unicode;
13305 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013306 else if (len != PyUnicode_GET_LENGTH(result)) {
13307 if (PyUnicode_Resize(&result, len) < 0)
13308 Py_CLEAR(result);
13309 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013310 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013311}
13312
Victor Stinner621ef3d2012-10-02 00:33:47 +020013313/* Format an integer.
13314 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013315 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013316 * -1 and raise an exception on error */
13317static int
Victor Stinnera47082312012-10-04 02:19:54 +020013318mainformatlong(PyObject *v,
13319 struct unicode_format_arg_t *arg,
13320 PyObject **p_output,
13321 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013322{
13323 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013324 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013325
13326 if (!PyNumber_Check(v))
13327 goto wrongtype;
13328
13329 if (!PyLong_Check(v)) {
13330 iobj = PyNumber_Long(v);
13331 if (iobj == NULL) {
13332 if (PyErr_ExceptionMatches(PyExc_TypeError))
13333 goto wrongtype;
13334 return -1;
13335 }
13336 assert(PyLong_Check(iobj));
13337 }
13338 else {
13339 iobj = v;
13340 Py_INCREF(iobj);
13341 }
13342
13343 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013344 && arg->width == -1 && arg->prec == -1
13345 && !(arg->flags & (F_SIGN | F_BLANK))
13346 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013347 {
13348 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013349 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013350 int base;
13351
Victor Stinnera47082312012-10-04 02:19:54 +020013352 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013353 {
13354 default:
13355 assert(0 && "'type' not in [diuoxX]");
13356 case 'd':
13357 case 'i':
13358 case 'u':
13359 base = 10;
13360 break;
13361 case 'o':
13362 base = 8;
13363 break;
13364 case 'x':
13365 case 'X':
13366 base = 16;
13367 break;
13368 }
13369
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013370 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13371 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013372 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013373 }
13374 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013375 return 1;
13376 }
13377
Victor Stinnera47082312012-10-04 02:19:54 +020013378 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013379 Py_DECREF(iobj);
13380 if (res == NULL)
13381 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013382 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013383 return 0;
13384
13385wrongtype:
13386 PyErr_Format(PyExc_TypeError,
13387 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013388 "not %.200s",
13389 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013390 return -1;
13391}
13392
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013393static Py_UCS4
13394formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013395{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013396 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013397 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013398 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013399 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013400 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013401 goto onError;
13402 }
13403 else {
13404 /* Integer input truncated to a character */
13405 long x;
13406 x = PyLong_AsLong(v);
13407 if (x == -1 && PyErr_Occurred())
13408 goto onError;
13409
Victor Stinner8faf8212011-12-08 22:14:11 +010013410 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 PyErr_SetString(PyExc_OverflowError,
13412 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013413 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013414 }
13415
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013416 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013417 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013418
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013420 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013421 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013422 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423}
13424
Victor Stinnera47082312012-10-04 02:19:54 +020013425/* Parse options of an argument: flags, width, precision.
13426 Handle also "%(name)" syntax.
13427
13428 Return 0 if the argument has been formatted into arg->str.
13429 Return 1 if the argument has been written into ctx->writer,
13430 Raise an exception and return -1 on error. */
13431static int
13432unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13433 struct unicode_format_arg_t *arg)
13434{
13435#define FORMAT_READ(ctx) \
13436 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13437
13438 PyObject *v;
13439
Victor Stinnera47082312012-10-04 02:19:54 +020013440 if (arg->ch == '(') {
13441 /* Get argument value from a dictionary. Example: "%(name)s". */
13442 Py_ssize_t keystart;
13443 Py_ssize_t keylen;
13444 PyObject *key;
13445 int pcount = 1;
13446
13447 if (ctx->dict == NULL) {
13448 PyErr_SetString(PyExc_TypeError,
13449 "format requires a mapping");
13450 return -1;
13451 }
13452 ++ctx->fmtpos;
13453 --ctx->fmtcnt;
13454 keystart = ctx->fmtpos;
13455 /* Skip over balanced parentheses */
13456 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13457 arg->ch = FORMAT_READ(ctx);
13458 if (arg->ch == ')')
13459 --pcount;
13460 else if (arg->ch == '(')
13461 ++pcount;
13462 ctx->fmtpos++;
13463 }
13464 keylen = ctx->fmtpos - keystart - 1;
13465 if (ctx->fmtcnt < 0 || pcount > 0) {
13466 PyErr_SetString(PyExc_ValueError,
13467 "incomplete format key");
13468 return -1;
13469 }
13470 key = PyUnicode_Substring(ctx->fmtstr,
13471 keystart, keystart + keylen);
13472 if (key == NULL)
13473 return -1;
13474 if (ctx->args_owned) {
13475 Py_DECREF(ctx->args);
13476 ctx->args_owned = 0;
13477 }
13478 ctx->args = PyObject_GetItem(ctx->dict, key);
13479 Py_DECREF(key);
13480 if (ctx->args == NULL)
13481 return -1;
13482 ctx->args_owned = 1;
13483 ctx->arglen = -1;
13484 ctx->argidx = -2;
13485 }
13486
13487 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013488 while (--ctx->fmtcnt >= 0) {
13489 arg->ch = FORMAT_READ(ctx);
13490 ctx->fmtpos++;
13491 switch (arg->ch) {
13492 case '-': arg->flags |= F_LJUST; continue;
13493 case '+': arg->flags |= F_SIGN; continue;
13494 case ' ': arg->flags |= F_BLANK; continue;
13495 case '#': arg->flags |= F_ALT; continue;
13496 case '0': arg->flags |= F_ZERO; continue;
13497 }
13498 break;
13499 }
13500
13501 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013502 if (arg->ch == '*') {
13503 v = unicode_format_getnextarg(ctx);
13504 if (v == NULL)
13505 return -1;
13506 if (!PyLong_Check(v)) {
13507 PyErr_SetString(PyExc_TypeError,
13508 "* wants int");
13509 return -1;
13510 }
13511 arg->width = PyLong_AsLong(v);
13512 if (arg->width == -1 && PyErr_Occurred())
13513 return -1;
13514 if (arg->width < 0) {
13515 arg->flags |= F_LJUST;
13516 arg->width = -arg->width;
13517 }
13518 if (--ctx->fmtcnt >= 0) {
13519 arg->ch = FORMAT_READ(ctx);
13520 ctx->fmtpos++;
13521 }
13522 }
13523 else if (arg->ch >= '0' && arg->ch <= '9') {
13524 arg->width = arg->ch - '0';
13525 while (--ctx->fmtcnt >= 0) {
13526 arg->ch = FORMAT_READ(ctx);
13527 ctx->fmtpos++;
13528 if (arg->ch < '0' || arg->ch > '9')
13529 break;
13530 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13531 mixing signed and unsigned comparison. Since arg->ch is between
13532 '0' and '9', casting to int is safe. */
13533 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13534 PyErr_SetString(PyExc_ValueError,
13535 "width too big");
13536 return -1;
13537 }
13538 arg->width = arg->width*10 + (arg->ch - '0');
13539 }
13540 }
13541
13542 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020013543 if (arg->ch == '.') {
13544 arg->prec = 0;
13545 if (--ctx->fmtcnt >= 0) {
13546 arg->ch = FORMAT_READ(ctx);
13547 ctx->fmtpos++;
13548 }
13549 if (arg->ch == '*') {
13550 v = unicode_format_getnextarg(ctx);
13551 if (v == NULL)
13552 return -1;
13553 if (!PyLong_Check(v)) {
13554 PyErr_SetString(PyExc_TypeError,
13555 "* wants int");
13556 return -1;
13557 }
13558 arg->prec = PyLong_AsLong(v);
13559 if (arg->prec == -1 && PyErr_Occurred())
13560 return -1;
13561 if (arg->prec < 0)
13562 arg->prec = 0;
13563 if (--ctx->fmtcnt >= 0) {
13564 arg->ch = FORMAT_READ(ctx);
13565 ctx->fmtpos++;
13566 }
13567 }
13568 else if (arg->ch >= '0' && arg->ch <= '9') {
13569 arg->prec = arg->ch - '0';
13570 while (--ctx->fmtcnt >= 0) {
13571 arg->ch = FORMAT_READ(ctx);
13572 ctx->fmtpos++;
13573 if (arg->ch < '0' || arg->ch > '9')
13574 break;
13575 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13576 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013577 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013578 return -1;
13579 }
13580 arg->prec = arg->prec*10 + (arg->ch - '0');
13581 }
13582 }
13583 }
13584
13585 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13586 if (ctx->fmtcnt >= 0) {
13587 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13588 if (--ctx->fmtcnt >= 0) {
13589 arg->ch = FORMAT_READ(ctx);
13590 ctx->fmtpos++;
13591 }
13592 }
13593 }
13594 if (ctx->fmtcnt < 0) {
13595 PyErr_SetString(PyExc_ValueError,
13596 "incomplete format");
13597 return -1;
13598 }
13599 return 0;
13600
13601#undef FORMAT_READ
13602}
13603
13604/* Format one argument. Supported conversion specifiers:
13605
13606 - "s", "r", "a": any type
13607 - "i", "d", "u", "o", "x", "X": int
13608 - "e", "E", "f", "F", "g", "G": float
13609 - "c": int or str (1 character)
13610
Victor Stinner8dbd4212012-12-04 09:30:24 +010013611 When possible, the output is written directly into the Unicode writer
13612 (ctx->writer). A string is created when padding is required.
13613
Victor Stinnera47082312012-10-04 02:19:54 +020013614 Return 0 if the argument has been formatted into *p_str,
13615 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010013616 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020013617static int
13618unicode_format_arg_format(struct unicode_formatter_t *ctx,
13619 struct unicode_format_arg_t *arg,
13620 PyObject **p_str)
13621{
13622 PyObject *v;
13623 _PyUnicodeWriter *writer = &ctx->writer;
13624
13625 if (ctx->fmtcnt == 0)
13626 ctx->writer.overallocate = 0;
13627
13628 if (arg->ch == '%') {
13629 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13630 return -1;
13631 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13632 writer->pos += 1;
13633 return 1;
13634 }
13635
13636 v = unicode_format_getnextarg(ctx);
13637 if (v == NULL)
13638 return -1;
13639
Victor Stinnera47082312012-10-04 02:19:54 +020013640
13641 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020013642 case 's':
13643 case 'r':
13644 case 'a':
13645 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13646 /* Fast path */
13647 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13648 return -1;
13649 return 1;
13650 }
13651
13652 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13653 *p_str = v;
13654 Py_INCREF(*p_str);
13655 }
13656 else {
13657 if (arg->ch == 's')
13658 *p_str = PyObject_Str(v);
13659 else if (arg->ch == 'r')
13660 *p_str = PyObject_Repr(v);
13661 else
13662 *p_str = PyObject_ASCII(v);
13663 }
13664 break;
13665
13666 case 'i':
13667 case 'd':
13668 case 'u':
13669 case 'o':
13670 case 'x':
13671 case 'X':
13672 {
13673 int ret = mainformatlong(v, arg, p_str, writer);
13674 if (ret != 0)
13675 return ret;
13676 arg->sign = 1;
13677 break;
13678 }
13679
13680 case 'e':
13681 case 'E':
13682 case 'f':
13683 case 'F':
13684 case 'g':
13685 case 'G':
13686 if (arg->width == -1 && arg->prec == -1
13687 && !(arg->flags & (F_SIGN | F_BLANK)))
13688 {
13689 /* Fast path */
13690 if (formatfloat(v, arg, NULL, writer) == -1)
13691 return -1;
13692 return 1;
13693 }
13694
13695 arg->sign = 1;
13696 if (formatfloat(v, arg, p_str, NULL) == -1)
13697 return -1;
13698 break;
13699
13700 case 'c':
13701 {
13702 Py_UCS4 ch = formatchar(v);
13703 if (ch == (Py_UCS4) -1)
13704 return -1;
13705 if (arg->width == -1 && arg->prec == -1) {
13706 /* Fast path */
13707 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13708 return -1;
13709 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13710 writer->pos += 1;
13711 return 1;
13712 }
13713 *p_str = PyUnicode_FromOrdinal(ch);
13714 break;
13715 }
13716
13717 default:
13718 PyErr_Format(PyExc_ValueError,
13719 "unsupported format character '%c' (0x%x) "
13720 "at index %zd",
13721 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13722 (int)arg->ch,
13723 ctx->fmtpos - 1);
13724 return -1;
13725 }
13726 if (*p_str == NULL)
13727 return -1;
13728 assert (PyUnicode_Check(*p_str));
13729 return 0;
13730}
13731
13732static int
13733unicode_format_arg_output(struct unicode_formatter_t *ctx,
13734 struct unicode_format_arg_t *arg,
13735 PyObject *str)
13736{
13737 Py_ssize_t len;
13738 enum PyUnicode_Kind kind;
13739 void *pbuf;
13740 Py_ssize_t pindex;
13741 Py_UCS4 signchar;
13742 Py_ssize_t buflen;
13743 Py_UCS4 maxchar, bufmaxchar;
13744 Py_ssize_t sublen;
13745 _PyUnicodeWriter *writer = &ctx->writer;
13746 Py_UCS4 fill;
13747
13748 fill = ' ';
13749 if (arg->sign && arg->flags & F_ZERO)
13750 fill = '0';
13751
13752 if (PyUnicode_READY(str) == -1)
13753 return -1;
13754
13755 len = PyUnicode_GET_LENGTH(str);
13756 if ((arg->width == -1 || arg->width <= len)
13757 && (arg->prec == -1 || arg->prec >= len)
13758 && !(arg->flags & (F_SIGN | F_BLANK)))
13759 {
13760 /* Fast path */
13761 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13762 return -1;
13763 return 0;
13764 }
13765
13766 /* Truncate the string for "s", "r" and "a" formats
13767 if the precision is set */
13768 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13769 if (arg->prec >= 0 && len > arg->prec)
13770 len = arg->prec;
13771 }
13772
13773 /* Adjust sign and width */
13774 kind = PyUnicode_KIND(str);
13775 pbuf = PyUnicode_DATA(str);
13776 pindex = 0;
13777 signchar = '\0';
13778 if (arg->sign) {
13779 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13780 if (ch == '-' || ch == '+') {
13781 signchar = ch;
13782 len--;
13783 pindex++;
13784 }
13785 else if (arg->flags & F_SIGN)
13786 signchar = '+';
13787 else if (arg->flags & F_BLANK)
13788 signchar = ' ';
13789 else
13790 arg->sign = 0;
13791 }
13792 if (arg->width < len)
13793 arg->width = len;
13794
13795 /* Prepare the writer */
13796 bufmaxchar = 127;
13797 if (!(arg->flags & F_LJUST)) {
13798 if (arg->sign) {
13799 if ((arg->width-1) > len)
13800 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13801 }
13802 else {
13803 if (arg->width > len)
13804 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13805 }
13806 }
13807 maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13808 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13809 buflen = arg->width;
13810 if (arg->sign && len == arg->width)
13811 buflen++;
13812 if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1)
13813 return -1;
13814
13815 /* Write the sign if needed */
13816 if (arg->sign) {
13817 if (fill != ' ') {
13818 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13819 writer->pos += 1;
13820 }
13821 if (arg->width > len)
13822 arg->width--;
13823 }
13824
13825 /* Write the numeric prefix for "x", "X" and "o" formats
13826 if the alternate form is used.
13827 For example, write "0x" for the "%#x" format. */
13828 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13829 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13830 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
13831 if (fill != ' ') {
13832 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13833 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13834 writer->pos += 2;
13835 pindex += 2;
13836 }
13837 arg->width -= 2;
13838 if (arg->width < 0)
13839 arg->width = 0;
13840 len -= 2;
13841 }
13842
13843 /* Pad left with the fill character if needed */
13844 if (arg->width > len && !(arg->flags & F_LJUST)) {
13845 sublen = arg->width - len;
13846 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
13847 writer->pos += sublen;
13848 arg->width = len;
13849 }
13850
13851 /* If padding with spaces: write sign if needed and/or numeric prefix if
13852 the alternate form is used */
13853 if (fill == ' ') {
13854 if (arg->sign) {
13855 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13856 writer->pos += 1;
13857 }
13858 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13859 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13860 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
13861 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13862 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13863 writer->pos += 2;
13864 pindex += 2;
13865 }
13866 }
13867
13868 /* Write characters */
13869 if (len) {
13870 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13871 str, pindex, len);
13872 writer->pos += len;
13873 }
13874
13875 /* Pad right with the fill character if needed */
13876 if (arg->width > len) {
13877 sublen = arg->width - len;
13878 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
13879 writer->pos += sublen;
13880 }
13881 return 0;
13882}
13883
13884/* Helper of PyUnicode_Format(): format one arg.
13885 Return 0 on success, raise an exception and return -1 on error. */
13886static int
13887unicode_format_arg(struct unicode_formatter_t *ctx)
13888{
13889 struct unicode_format_arg_t arg;
13890 PyObject *str;
13891 int ret;
13892
Victor Stinner8dbd4212012-12-04 09:30:24 +010013893 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
13894 arg.flags = 0;
13895 arg.width = -1;
13896 arg.prec = -1;
13897 arg.sign = 0;
13898 str = NULL;
13899
Victor Stinnera47082312012-10-04 02:19:54 +020013900 ret = unicode_format_arg_parse(ctx, &arg);
13901 if (ret == -1)
13902 return -1;
13903
13904 ret = unicode_format_arg_format(ctx, &arg, &str);
13905 if (ret == -1)
13906 return -1;
13907
13908 if (ret != 1) {
13909 ret = unicode_format_arg_output(ctx, &arg, str);
13910 Py_DECREF(str);
13911 if (ret == -1)
13912 return -1;
13913 }
13914
13915 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
13916 PyErr_SetString(PyExc_TypeError,
13917 "not all arguments converted during string formatting");
13918 return -1;
13919 }
13920 return 0;
13921}
13922
Alexander Belopolsky40018472011-02-26 01:02:56 +000013923PyObject *
13924PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013925{
Victor Stinnera47082312012-10-04 02:19:54 +020013926 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000013927
Guido van Rossumd57fd912000-03-10 22:53:23 +000013928 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013929 PyErr_BadInternalCall();
13930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013931 }
Victor Stinnera47082312012-10-04 02:19:54 +020013932
13933 ctx.fmtstr = PyUnicode_FromObject(format);
13934 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013935 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020013936 if (PyUnicode_READY(ctx.fmtstr) == -1) {
13937 Py_DECREF(ctx.fmtstr);
13938 return NULL;
13939 }
13940 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
13941 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
13942 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
13943 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013944
Victor Stinnera47082312012-10-04 02:19:54 +020013945 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013946
Guido van Rossumd57fd912000-03-10 22:53:23 +000013947 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020013948 ctx.arglen = PyTuple_Size(args);
13949 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013950 }
13951 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013952 ctx.arglen = -1;
13953 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013954 }
Victor Stinnera47082312012-10-04 02:19:54 +020013955 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013956 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020013957 ctx.dict = args;
13958 else
13959 ctx.dict = NULL;
13960 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013961
Victor Stinnera47082312012-10-04 02:19:54 +020013962 while (--ctx.fmtcnt >= 0) {
13963 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13964 Py_ssize_t nonfmtpos, sublen;
13965 Py_UCS4 maxchar;
13966
13967 nonfmtpos = ctx.fmtpos++;
13968 while (ctx.fmtcnt >= 0 &&
13969 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13970 ctx.fmtpos++;
13971 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013972 }
Victor Stinnera47082312012-10-04 02:19:54 +020013973 if (ctx.fmtcnt < 0) {
13974 ctx.fmtpos--;
13975 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020013976 }
Victor Stinnera47082312012-10-04 02:19:54 +020013977 sublen = ctx.fmtpos - nonfmtpos;
13978 maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr,
Victor Stinneree4544c2012-05-09 22:24:08 +020013979 nonfmtpos, nonfmtpos + sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020013980 if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013981 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013982
Victor Stinnera47082312012-10-04 02:19:54 +020013983 _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos,
13984 ctx.fmtstr, nonfmtpos, sublen);
13985 ctx.writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013986 }
13987 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013988 ctx.fmtpos++;
13989 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013990 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020013991 }
13992 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013993
Victor Stinnera47082312012-10-04 02:19:54 +020013994 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013995 PyErr_SetString(PyExc_TypeError,
13996 "not all arguments converted during string formatting");
13997 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013998 }
13999
Victor Stinnera47082312012-10-04 02:19:54 +020014000 if (ctx.args_owned) {
14001 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014002 }
Victor Stinnera47082312012-10-04 02:19:54 +020014003 Py_DECREF(ctx.fmtstr);
14004 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014005
Benjamin Peterson29060642009-01-31 22:14:21 +000014006 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014007 Py_DECREF(ctx.fmtstr);
14008 _PyUnicodeWriter_Dealloc(&ctx.writer);
14009 if (ctx.args_owned) {
14010 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014011 }
14012 return NULL;
14013}
14014
Jeremy Hylton938ace62002-07-17 16:30:39 +000014015static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014016unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14017
Tim Peters6d6c1a32001-08-02 04:15:00 +000014018static PyObject *
14019unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14020{
Benjamin Peterson29060642009-01-31 22:14:21 +000014021 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014022 static char *kwlist[] = {"object", "encoding", "errors", 0};
14023 char *encoding = NULL;
14024 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014025
Benjamin Peterson14339b62009-01-31 16:36:08 +000014026 if (type != &PyUnicode_Type)
14027 return unicode_subtype_new(type, args, kwds);
14028 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014029 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014030 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014031 if (x == NULL) {
14032 Py_INCREF(unicode_empty);
14033 return unicode_empty;
14034 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014035 if (encoding == NULL && errors == NULL)
14036 return PyObject_Str(x);
14037 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014038 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014039}
14040
Guido van Rossume023fe02001-08-30 03:12:59 +000014041static PyObject *
14042unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14043{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014044 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014045 Py_ssize_t length, char_size;
14046 int share_wstr, share_utf8;
14047 unsigned int kind;
14048 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014049
Benjamin Peterson14339b62009-01-31 16:36:08 +000014050 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014051
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014052 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014053 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014054 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014055 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014056 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014057 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014058 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014059 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014060
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014061 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014062 if (self == NULL) {
14063 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014064 return NULL;
14065 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014066 kind = PyUnicode_KIND(unicode);
14067 length = PyUnicode_GET_LENGTH(unicode);
14068
14069 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014070#ifdef Py_DEBUG
14071 _PyUnicode_HASH(self) = -1;
14072#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014073 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014074#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014075 _PyUnicode_STATE(self).interned = 0;
14076 _PyUnicode_STATE(self).kind = kind;
14077 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014078 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014079 _PyUnicode_STATE(self).ready = 1;
14080 _PyUnicode_WSTR(self) = NULL;
14081 _PyUnicode_UTF8_LENGTH(self) = 0;
14082 _PyUnicode_UTF8(self) = NULL;
14083 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014084 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014085
14086 share_utf8 = 0;
14087 share_wstr = 0;
14088 if (kind == PyUnicode_1BYTE_KIND) {
14089 char_size = 1;
14090 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14091 share_utf8 = 1;
14092 }
14093 else if (kind == PyUnicode_2BYTE_KIND) {
14094 char_size = 2;
14095 if (sizeof(wchar_t) == 2)
14096 share_wstr = 1;
14097 }
14098 else {
14099 assert(kind == PyUnicode_4BYTE_KIND);
14100 char_size = 4;
14101 if (sizeof(wchar_t) == 4)
14102 share_wstr = 1;
14103 }
14104
14105 /* Ensure we won't overflow the length. */
14106 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14107 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014108 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014109 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014110 data = PyObject_MALLOC((length + 1) * char_size);
14111 if (data == NULL) {
14112 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014113 goto onError;
14114 }
14115
Victor Stinnerc3c74152011-10-02 20:39:55 +020014116 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014117 if (share_utf8) {
14118 _PyUnicode_UTF8_LENGTH(self) = length;
14119 _PyUnicode_UTF8(self) = data;
14120 }
14121 if (share_wstr) {
14122 _PyUnicode_WSTR_LENGTH(self) = length;
14123 _PyUnicode_WSTR(self) = (wchar_t *)data;
14124 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014125
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014126 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014127 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014128 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014129#ifdef Py_DEBUG
14130 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14131#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014132 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014133 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014134
14135onError:
14136 Py_DECREF(unicode);
14137 Py_DECREF(self);
14138 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014139}
14140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014141PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014142"str(object='') -> str\n\
14143str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014144\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014145Create a new string object from the given object. If encoding or\n\
14146errors is specified, then the object must expose a data buffer\n\
14147that will be decoded using the given encoding and error handler.\n\
14148Otherwise, returns the result of object.__str__() (if defined)\n\
14149or repr(object).\n\
14150encoding defaults to sys.getdefaultencoding().\n\
14151errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014152
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014153static PyObject *unicode_iter(PyObject *seq);
14154
Guido van Rossumd57fd912000-03-10 22:53:23 +000014155PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014156 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014157 "str", /* tp_name */
14158 sizeof(PyUnicodeObject), /* tp_size */
14159 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014160 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014161 (destructor)unicode_dealloc, /* tp_dealloc */
14162 0, /* tp_print */
14163 0, /* tp_getattr */
14164 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014165 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014166 unicode_repr, /* tp_repr */
14167 &unicode_as_number, /* tp_as_number */
14168 &unicode_as_sequence, /* tp_as_sequence */
14169 &unicode_as_mapping, /* tp_as_mapping */
14170 (hashfunc) unicode_hash, /* tp_hash*/
14171 0, /* tp_call*/
14172 (reprfunc) unicode_str, /* tp_str */
14173 PyObject_GenericGetAttr, /* tp_getattro */
14174 0, /* tp_setattro */
14175 0, /* tp_as_buffer */
14176 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014177 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014178 unicode_doc, /* tp_doc */
14179 0, /* tp_traverse */
14180 0, /* tp_clear */
14181 PyUnicode_RichCompare, /* tp_richcompare */
14182 0, /* tp_weaklistoffset */
14183 unicode_iter, /* tp_iter */
14184 0, /* tp_iternext */
14185 unicode_methods, /* tp_methods */
14186 0, /* tp_members */
14187 0, /* tp_getset */
14188 &PyBaseObject_Type, /* tp_base */
14189 0, /* tp_dict */
14190 0, /* tp_descr_get */
14191 0, /* tp_descr_set */
14192 0, /* tp_dictoffset */
14193 0, /* tp_init */
14194 0, /* tp_alloc */
14195 unicode_new, /* tp_new */
14196 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014197};
14198
14199/* Initialize the Unicode implementation */
14200
Victor Stinner3a50e702011-10-18 21:21:00 +020014201int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014202{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014203 int i;
14204
Thomas Wouters477c8d52006-05-27 19:21:47 +000014205 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014206 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014207 0x000A, /* LINE FEED */
14208 0x000D, /* CARRIAGE RETURN */
14209 0x001C, /* FILE SEPARATOR */
14210 0x001D, /* GROUP SEPARATOR */
14211 0x001E, /* RECORD SEPARATOR */
14212 0x0085, /* NEXT LINE */
14213 0x2028, /* LINE SEPARATOR */
14214 0x2029, /* PARAGRAPH SEPARATOR */
14215 };
14216
Fred Drakee4315f52000-05-09 19:53:39 +000014217 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014218 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014219 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014220 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014221 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014222
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014223 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014224 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014225 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014226 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014227
14228 /* initialize the linebreak bloom filter */
14229 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014230 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014231 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014232
14233 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014234
Benjamin Petersonc4311282012-10-30 23:21:10 -040014235 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14236 Py_FatalError("Can't initialize field name iterator type");
14237
14238 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14239 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014240
Victor Stinner3a50e702011-10-18 21:21:00 +020014241#ifdef HAVE_MBCS
14242 winver.dwOSVersionInfoSize = sizeof(winver);
14243 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14244 PyErr_SetFromWindowsErr(0);
14245 return -1;
14246 }
14247#endif
14248 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014249}
14250
14251/* Finalize the Unicode implementation */
14252
/* Nothing is released here: the function does no work and always
   returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14258
Guido van Rossumd57fd912000-03-10 22:53:23 +000014259void
Thomas Wouters78890102000-07-22 19:25:51 +000014260_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014261{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014262 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014263
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014264 Py_XDECREF(unicode_empty);
14265 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014266
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014267 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014268 if (unicode_latin1[i]) {
14269 Py_DECREF(unicode_latin1[i]);
14270 unicode_latin1[i] = NULL;
14271 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014272 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014273 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014274 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014275}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014276
Walter Dörwald16807132007-05-25 13:52:07 +000014277void
14278PyUnicode_InternInPlace(PyObject **p)
14279{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014280 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014281 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014282#ifdef Py_DEBUG
14283 assert(s != NULL);
14284 assert(_PyUnicode_CHECK(s));
14285#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014286 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014287 return;
14288#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014289 /* If it's a subclass, we don't really know what putting
14290 it in the interned dict might do. */
14291 if (!PyUnicode_CheckExact(s))
14292 return;
14293 if (PyUnicode_CHECK_INTERNED(s))
14294 return;
14295 if (interned == NULL) {
14296 interned = PyDict_New();
14297 if (interned == NULL) {
14298 PyErr_Clear(); /* Don't leave an exception */
14299 return;
14300 }
14301 }
14302 /* It might be that the GetItem call fails even
14303 though the key is present in the dictionary,
14304 namely when this happens during a stack overflow. */
14305 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014306 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014307 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014308
Benjamin Peterson29060642009-01-31 22:14:21 +000014309 if (t) {
14310 Py_INCREF(t);
14311 Py_DECREF(*p);
14312 *p = t;
14313 return;
14314 }
Walter Dörwald16807132007-05-25 13:52:07 +000014315
Benjamin Peterson14339b62009-01-31 16:36:08 +000014316 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014317 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014318 PyErr_Clear();
14319 PyThreadState_GET()->recursion_critical = 0;
14320 return;
14321 }
14322 PyThreadState_GET()->recursion_critical = 0;
14323 /* The two references in interned are not counted by refcnt.
14324 The deallocator will take care of this */
14325 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014326 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014327}
14328
14329void
14330PyUnicode_InternImmortal(PyObject **p)
14331{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014332 PyUnicode_InternInPlace(p);
14333 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014334 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014335 Py_INCREF(*p);
14336 }
Walter Dörwald16807132007-05-25 13:52:07 +000014337}
14338
14339PyObject *
14340PyUnicode_InternFromString(const char *cp)
14341{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014342 PyObject *s = PyUnicode_FromString(cp);
14343 if (s == NULL)
14344 return NULL;
14345 PyUnicode_InternInPlace(&s);
14346 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014347}
14348
Alexander Belopolsky40018472011-02-26 01:02:56 +000014349void
14350_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014351{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014352 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014353 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014354 Py_ssize_t i, n;
14355 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014356
Benjamin Peterson14339b62009-01-31 16:36:08 +000014357 if (interned == NULL || !PyDict_Check(interned))
14358 return;
14359 keys = PyDict_Keys(interned);
14360 if (keys == NULL || !PyList_Check(keys)) {
14361 PyErr_Clear();
14362 return;
14363 }
Walter Dörwald16807132007-05-25 13:52:07 +000014364
Benjamin Peterson14339b62009-01-31 16:36:08 +000014365 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14366 detector, interned unicode strings are not forcibly deallocated;
14367 rather, we give them their stolen references back, and then clear
14368 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014369
Benjamin Peterson14339b62009-01-31 16:36:08 +000014370 n = PyList_GET_SIZE(keys);
14371 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014372 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014373 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014374 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014375 if (PyUnicode_READY(s) == -1) {
14376 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014377 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014379 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014380 case SSTATE_NOT_INTERNED:
14381 /* XXX Shouldn't happen */
14382 break;
14383 case SSTATE_INTERNED_IMMORTAL:
14384 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014385 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014386 break;
14387 case SSTATE_INTERNED_MORTAL:
14388 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014389 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014390 break;
14391 default:
14392 Py_FatalError("Inconsistent interned string state.");
14393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014394 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014395 }
14396 fprintf(stderr, "total size of all interned strings: "
14397 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14398 "mortal/immortal\n", mortal_size, immortal_size);
14399 Py_DECREF(keys);
14400 PyDict_Clear(interned);
14401 Py_DECREF(interned);
14402 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014403}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014404
14405
14406/********************* Unicode Iterator **************************/
14407
14408typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014409 PyObject_HEAD
14410 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014411 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014412} unicodeiterobject;
14413
14414static void
14415unicodeiter_dealloc(unicodeiterobject *it)
14416{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014417 _PyObject_GC_UNTRACK(it);
14418 Py_XDECREF(it->it_seq);
14419 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014420}
14421
14422static int
14423unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14424{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014425 Py_VISIT(it->it_seq);
14426 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014427}
14428
14429static PyObject *
14430unicodeiter_next(unicodeiterobject *it)
14431{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014432 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014433
Benjamin Peterson14339b62009-01-31 16:36:08 +000014434 assert(it != NULL);
14435 seq = it->it_seq;
14436 if (seq == NULL)
14437 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014438 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014439
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014440 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14441 int kind = PyUnicode_KIND(seq);
14442 void *data = PyUnicode_DATA(seq);
14443 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14444 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014445 if (item != NULL)
14446 ++it->it_index;
14447 return item;
14448 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014449
Benjamin Peterson14339b62009-01-31 16:36:08 +000014450 Py_DECREF(seq);
14451 it->it_seq = NULL;
14452 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014453}
14454
14455static PyObject *
14456unicodeiter_len(unicodeiterobject *it)
14457{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014458 Py_ssize_t len = 0;
14459 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014460 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014461 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014462}
14463
14464PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14465
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014466static PyObject *
14467unicodeiter_reduce(unicodeiterobject *it)
14468{
14469 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014470 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014471 it->it_seq, it->it_index);
14472 } else {
14473 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14474 if (u == NULL)
14475 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014476 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014477 }
14478}
14479
14480PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14481
14482static PyObject *
14483unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14484{
14485 Py_ssize_t index = PyLong_AsSsize_t(state);
14486 if (index == -1 && PyErr_Occurred())
14487 return NULL;
14488 if (index < 0)
14489 index = 0;
14490 it->it_index = index;
14491 Py_RETURN_NONE;
14492}
14493
14494PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14495
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014496static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014497 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014498 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014499 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14500 reduce_doc},
14501 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14502 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014503 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014504};
14505
14506PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014507 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14508 "str_iterator", /* tp_name */
14509 sizeof(unicodeiterobject), /* tp_basicsize */
14510 0, /* tp_itemsize */
14511 /* methods */
14512 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14513 0, /* tp_print */
14514 0, /* tp_getattr */
14515 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014516 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014517 0, /* tp_repr */
14518 0, /* tp_as_number */
14519 0, /* tp_as_sequence */
14520 0, /* tp_as_mapping */
14521 0, /* tp_hash */
14522 0, /* tp_call */
14523 0, /* tp_str */
14524 PyObject_GenericGetAttr, /* tp_getattro */
14525 0, /* tp_setattro */
14526 0, /* tp_as_buffer */
14527 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14528 0, /* tp_doc */
14529 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14530 0, /* tp_clear */
14531 0, /* tp_richcompare */
14532 0, /* tp_weaklistoffset */
14533 PyObject_SelfIter, /* tp_iter */
14534 (iternextfunc)unicodeiter_next, /* tp_iternext */
14535 unicodeiter_methods, /* tp_methods */
14536 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014537};
14538
14539static PyObject *
14540unicode_iter(PyObject *seq)
14541{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014542 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014543
Benjamin Peterson14339b62009-01-31 16:36:08 +000014544 if (!PyUnicode_Check(seq)) {
14545 PyErr_BadInternalCall();
14546 return NULL;
14547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014548 if (PyUnicode_READY(seq) == -1)
14549 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014550 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14551 if (it == NULL)
14552 return NULL;
14553 it->it_index = 0;
14554 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014555 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014556 _PyObject_GC_TRACK(it);
14557 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014558}
14559
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014560
14561size_t
14562Py_UNICODE_strlen(const Py_UNICODE *u)
14563{
14564 int res = 0;
14565 while(*u++)
14566 res++;
14567 return res;
14568}
14569
14570Py_UNICODE*
14571Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14572{
14573 Py_UNICODE *u = s1;
14574 while ((*u++ = *s2++));
14575 return s1;
14576}
14577
14578Py_UNICODE*
14579Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14580{
14581 Py_UNICODE *u = s1;
14582 while ((*u++ = *s2++))
14583 if (n-- == 0)
14584 break;
14585 return s1;
14586}
14587
14588Py_UNICODE*
14589Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14590{
14591 Py_UNICODE *u1 = s1;
14592 u1 += Py_UNICODE_strlen(u1);
14593 Py_UNICODE_strcpy(u1, s2);
14594 return s1;
14595}
14596
14597int
14598Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14599{
14600 while (*s1 && *s2 && *s1 == *s2)
14601 s1++, s2++;
14602 if (*s1 && *s2)
14603 return (*s1 < *s2) ? -1 : +1;
14604 if (*s1)
14605 return 1;
14606 if (*s2)
14607 return -1;
14608 return 0;
14609}
14610
14611int
14612Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14613{
14614 register Py_UNICODE u1, u2;
14615 for (; n != 0; n--) {
14616 u1 = *s1;
14617 u2 = *s2;
14618 if (u1 != u2)
14619 return (u1 < u2) ? -1 : +1;
14620 if (u1 == '\0')
14621 return 0;
14622 s1++;
14623 s2++;
14624 }
14625 return 0;
14626}
14627
14628Py_UNICODE*
14629Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14630{
14631 const Py_UNICODE *p;
14632 for (p = s; *p; p++)
14633 if (*p == c)
14634 return (Py_UNICODE*)p;
14635 return NULL;
14636}
14637
14638Py_UNICODE*
14639Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14640{
14641 const Py_UNICODE *p;
14642 p = s + Py_UNICODE_strlen(s);
14643 while (p != s) {
14644 p--;
14645 if (*p == c)
14646 return (Py_UNICODE*)p;
14647 }
14648 return NULL;
14649}
Victor Stinner331ea922010-08-10 16:37:20 +000014650
Victor Stinner71133ff2010-09-01 23:43:53 +000014651Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014652PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014653{
Victor Stinner577db2c2011-10-11 22:12:48 +020014654 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014655 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014657 if (!PyUnicode_Check(unicode)) {
14658 PyErr_BadArgument();
14659 return NULL;
14660 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014661 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014662 if (u == NULL)
14663 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014664 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014665 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014666 PyErr_NoMemory();
14667 return NULL;
14668 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014669 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014670 size *= sizeof(Py_UNICODE);
14671 copy = PyMem_Malloc(size);
14672 if (copy == NULL) {
14673 PyErr_NoMemory();
14674 return NULL;
14675 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014676 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014677 return copy;
14678}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014679
Georg Brandl66c221e2010-10-14 07:04:07 +000014680/* A _string module, to export formatter_parser and formatter_field_name_split
14681 to the string.Formatter class implemented in Python. */
14682
14683static PyMethodDef _string_methods[] = {
14684 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14685 METH_O, PyDoc_STR("split the argument as a field name")},
14686 {"formatter_parser", (PyCFunction) formatter_parser,
14687 METH_O, PyDoc_STR("parse the argument as a format string")},
14688 {NULL, NULL}
14689};
14690
14691static struct PyModuleDef _string_module = {
14692 PyModuleDef_HEAD_INIT,
14693 "_string",
14694 PyDoc_STR("string helper module"),
14695 0,
14696 _string_methods,
14697 NULL,
14698 NULL,
14699 NULL,
14700 NULL
14701};
14702
14703PyMODINIT_FUNC
14704PyInit__string(void)
14705{
14706 return PyModule_Create(&_string_module);
14707}
14708
14709
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014710#ifdef __cplusplus
14711}
14712#endif