blob: 362f2cffcf12c6d71d7c34bda82e11982c52e639 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
52 The globals are initialized by the _PyUnicode_Init() API and should
53 not be used before calling that API.
54
55*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000057
58#ifdef __cplusplus
59extern "C" {
60#endif
61
Victor Stinner8faf8212011-12-08 22:14:11 +010062/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
63#define MAX_UNICODE 0x10ffff
64
Victor Stinner910337b2011-10-03 03:20:16 +020065#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020066# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020067#else
68# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
69#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020070
Victor Stinnere90fe6a2011-10-01 16:48:13 +020071#define _PyUnicode_UTF8(op) \
72 (((PyCompactUnicodeObject*)(op))->utf8)
73#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020074 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020075 assert(PyUnicode_IS_READY(op)), \
76 PyUnicode_IS_COMPACT_ASCII(op) ? \
77 ((char*)((PyASCIIObject*)(op) + 1)) : \
78 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +020079#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020080 (((PyCompactUnicodeObject*)(op))->utf8_length)
81#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020082 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020083 assert(PyUnicode_IS_READY(op)), \
84 PyUnicode_IS_COMPACT_ASCII(op) ? \
85 ((PyASCIIObject*)(op))->length : \
86 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +020087#define _PyUnicode_WSTR(op) \
88 (((PyASCIIObject*)(op))->wstr)
89#define _PyUnicode_WSTR_LENGTH(op) \
90 (((PyCompactUnicodeObject*)(op))->wstr_length)
91#define _PyUnicode_LENGTH(op) \
92 (((PyASCIIObject *)(op))->length)
93#define _PyUnicode_STATE(op) \
94 (((PyASCIIObject *)(op))->state)
95#define _PyUnicode_HASH(op) \
96 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +020097#define _PyUnicode_KIND(op) \
98 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020099 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200100#define _PyUnicode_GET_LENGTH(op) \
101 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200102 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200103#define _PyUnicode_DATA_ANY(op) \
104 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200105
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New() */
108#define MAX_MAXCHAR(maxchar1, maxchar2) \
109 ((maxchar1) | (maxchar2))
110
Victor Stinner910337b2011-10-03 03:20:16 +0200111#undef PyUnicode_READY
112#define PyUnicode_READY(op) \
113 (assert(_PyUnicode_CHECK(op)), \
114 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200115 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100116 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200117
/* true if the UTF-8 pointer of the object aliases its character data
   (must not be used on compact ASCII objects, see the assertion) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer of the object aliases its character data.
   The macro argument is used consistently: the expansion must not depend
   on the caller having a local variable with a particular name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
125
Victor Stinner829c0ad2011-10-03 01:08:02 +0200126/* true if the Unicode object has an allocated UTF-8 memory block
127 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200128#define _PyUnicode_HAS_UTF8_MEMORY(op) \
129 (assert(_PyUnicode_CHECK(op)), \
130 (!PyUnicode_IS_COMPACT_ASCII(op) \
131 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
133
Victor Stinner03490912011-10-03 23:45:12 +0200134/* true if the Unicode object has an allocated wstr memory block
135 (not shared with other data) */
136#define _PyUnicode_HAS_WSTR_MEMORY(op) \
137 (assert(_PyUnicode_CHECK(op)), \
138 (_PyUnicode_WSTR(op) && \
139 (!PyUnicode_IS_READY(op) || \
140 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every pointer argument is parenthesized before it is cast or copied, so
   that an expression such as "buf + offset" is evaluated as a whole first.
   The bulk of the input is copied four characters per iteration; the
   remaining 0-3 characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
Walter Dörwald16807132007-05-25 13:52:07 +0000166/* This dictionary holds all interned unicode strings. Note that references
167 to strings in this dictionary are *not* counted in the string's ob_refcnt.
168 When the interned string reaches a refcnt of 0 the string deallocation
169 function will delete the reference from this dictionary.
170
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000173*/
174static PyObject *interned;
175
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000176/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200177static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200179/* List of static strings. */
180static _Py_Identifier *static_strings;
181
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000182/* Single character Unicode strings in the Latin-1 range are being
183 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200184static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185
/* Fast detection of the most frequent whitespace characters.

   One entry per 7-bit code (128 entries); an entry is 1 when the code is
   treated as whitespace:
     0x09 CHARACTER TABULATION   0x0A LINE FEED       0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
216
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200217/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200218static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100220static int unicode_modifiable(PyObject *unicode);
221
Victor Stinnerfe226c02011-10-03 03:52:20 +0200222
Alexander Belopolsky40018472011-02-26 01:02:56 +0000223static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200224_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
225static PyObject *
226_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
227static PyObject *
228_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
229
230static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000231unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000232 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100233 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000234 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
235
Alexander Belopolsky40018472011-02-26 01:02:56 +0000236static void
237raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300238 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100239 PyObject *unicode,
240 Py_ssize_t startpos, Py_ssize_t endpos,
241 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000242
/* Fast detection of line break characters in the 7-bit range.

   One entry per code below 128; an entry is 1 when the code is a line
   break.  The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
270
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300271/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
272 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000273Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000274PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000275{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000276#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000277 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000278#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000279 /* This is actually an illegal character, so it should
280 not be passed to unichr. */
281 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282#endif
283}
284
#ifdef Py_DEBUG
/* Debug-build invariant checker for a Unicode object.

   Asserts that the state flags, the data/utf8/wstr pointers and the
   cached lengths of op agree with each other.  When check_content is
   non-zero (and the object is not a not-yet-ready wchar_t string) the
   characters are scanned as well, to verify that the narrowest possible
   kind is used and that the buffer is null-terminated.

   Always returns 1, so the call can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the structure */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wchar_t buffer exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, with a separately stored data buffer */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_sized_kind) {
                /* wstr must alias the data when the widths match */
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(ascii);
        Py_UCS4 highest = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 current = PyUnicode_READ(kind, chars, pos);
            if (current > highest)
                highest = current;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
        }
        else {
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
        }
        /* the character just past the end must be the null terminator */
        assert(PyUnicode_READ(kind, chars, ascii->length) == 0);
    }
    return 1;
}
#endif
400
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100401static PyObject*
402unicode_result_wchar(PyObject *unicode)
403{
404#ifndef Py_DEBUG
405 Py_ssize_t len;
406
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100407 len = _PyUnicode_WSTR_LENGTH(unicode);
408 if (len == 0) {
409 Py_INCREF(unicode_empty);
410 Py_DECREF(unicode);
411 return unicode_empty;
412 }
413
414 if (len == 1) {
415 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
416 if (ch < 256) {
417 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
418 Py_DECREF(unicode);
419 return latin1_char;
420 }
421 }
422
423 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200424 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425 return NULL;
426 }
427#else
Victor Stinneraa771272012-10-04 02:32:58 +0200428 assert(Py_REFCNT(unicode) == 1);
429
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100430 /* don't make the result ready in debug mode to ensure that the caller
431 makes the string ready before using it */
432 assert(_PyUnicode_CheckConsistency(unicode, 1));
433#endif
434 return unicode;
435}
436
437static PyObject*
438unicode_result_ready(PyObject *unicode)
439{
440 Py_ssize_t length;
441
442 length = PyUnicode_GET_LENGTH(unicode);
443 if (length == 0) {
444 if (unicode != unicode_empty) {
445 Py_INCREF(unicode_empty);
446 Py_DECREF(unicode);
447 }
448 return unicode_empty;
449 }
450
451 if (length == 1) {
452 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
453 if (ch < 256) {
454 PyObject *latin1_char = unicode_latin1[ch];
455 if (latin1_char != NULL) {
456 if (unicode != latin1_char) {
457 Py_INCREF(latin1_char);
458 Py_DECREF(unicode);
459 }
460 return latin1_char;
461 }
462 else {
463 assert(_PyUnicode_CheckConsistency(unicode, 1));
464 Py_INCREF(unicode);
465 unicode_latin1[ch] = unicode;
466 return unicode;
467 }
468 }
469 }
470
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 return unicode;
473}
474
475static PyObject*
476unicode_result(PyObject *unicode)
477{
478 assert(_PyUnicode_CHECK(unicode));
479 if (PyUnicode_IS_READY(unicode))
480 return unicode_result_ready(unicode);
481 else
482 return unicode_result_wchar(unicode);
483}
484
Victor Stinnerc4b49542011-12-11 22:44:26 +0100485static PyObject*
486unicode_result_unchanged(PyObject *unicode)
487{
488 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500489 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490 return NULL;
491 Py_INCREF(unicode);
492 return unicode;
493 }
494 else
495 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100496 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100497}
498
Victor Stinner3a50e702011-10-18 21:21:00 +0200499#ifdef HAVE_MBCS
500static OSVERSIONINFOEX winver;
501#endif
502
Thomas Wouters477c8d52006-05-27 19:21:47 +0000503/* --- Bloom Filters ----------------------------------------------------- */
504
505/* stuff to implement simple "bloom filters" for Unicode characters.
506 to keep things simple, we use a single bitmask, using the least 5
507 bits from each unicode characters as the bit index. */
508
509/* the linebreak mask is set up by Unicode_Init below */
510
Antoine Pitrouf068f942010-01-13 14:19:12 +0000511#if LONG_BIT >= 128
512#define BLOOM_WIDTH 128
513#elif LONG_BIT >= 64
514#define BLOOM_WIDTH 64
515#elif LONG_BIT >= 32
516#define BLOOM_WIDTH 32
517#else
518#error "LONG_BIT is smaller than 32"
519#endif
520
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521#define BLOOM_MASK unsigned long
522
523static BLOOM_MASK bloom_linebreak;
524
Antoine Pitrouf068f942010-01-13 14:19:12 +0000525#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
526#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000527
Benjamin Peterson29060642009-01-31 22:14:21 +0000528#define BLOOM_LINEBREAK(ch) \
529 ((ch) < 128U ? ascii_linebreak[(ch)] : \
530 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000531
Alexander Belopolsky40018472011-02-26 01:02:56 +0000532Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000534{
535 /* calculate simple bloom-style bitmask for a given unicode string */
536
Antoine Pitrouf068f942010-01-13 14:19:12 +0000537 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000538 Py_ssize_t i;
539
540 mask = 0;
541 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543
544 return mask;
545}
546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547#define BLOOM_MEMBER(mask, chr, str) \
548 (BLOOM(mask, chr) \
549 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000550
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200551/* Compilation of templated routines */
552
553#include "stringlib/asciilib.h"
554#include "stringlib/fastsearch.h"
555#include "stringlib/partition.h"
556#include "stringlib/split.h"
557#include "stringlib/count.h"
558#include "stringlib/find.h"
559#include "stringlib/find_max_char.h"
560#include "stringlib/localeutil.h"
561#include "stringlib/undef.h"
562
563#include "stringlib/ucs1lib.h"
564#include "stringlib/fastsearch.h"
565#include "stringlib/partition.h"
566#include "stringlib/split.h"
567#include "stringlib/count.h"
568#include "stringlib/find.h"
569#include "stringlib/find_max_char.h"
570#include "stringlib/localeutil.h"
571#include "stringlib/undef.h"
572
573#include "stringlib/ucs2lib.h"
574#include "stringlib/fastsearch.h"
575#include "stringlib/partition.h"
576#include "stringlib/split.h"
577#include "stringlib/count.h"
578#include "stringlib/find.h"
579#include "stringlib/find_max_char.h"
580#include "stringlib/localeutil.h"
581#include "stringlib/undef.h"
582
583#include "stringlib/ucs4lib.h"
584#include "stringlib/fastsearch.h"
585#include "stringlib/partition.h"
586#include "stringlib/split.h"
587#include "stringlib/count.h"
588#include "stringlib/find.h"
589#include "stringlib/find_max_char.h"
590#include "stringlib/localeutil.h"
591#include "stringlib/undef.h"
592
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200593#include "stringlib/unicodedefs.h"
594#include "stringlib/fastsearch.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100597#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599/* --- Unicode Object ----------------------------------------------------- */
600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200601static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200602fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200603
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200604Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
605 Py_ssize_t size, Py_UCS4 ch,
606 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200608 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
609
610 switch (kind) {
611 case PyUnicode_1BYTE_KIND:
612 {
613 Py_UCS1 ch1 = (Py_UCS1) ch;
614 if (ch1 == ch)
615 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
616 else
617 return -1;
618 }
619 case PyUnicode_2BYTE_KIND:
620 {
621 Py_UCS2 ch2 = (Py_UCS2) ch;
622 if (ch2 == ch)
623 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_4BYTE_KIND:
628 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
629 default:
630 assert(0);
631 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200633}
634
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters between old_length and the current length are
   overwritten (every byte is set to 0xff); nothing is done when the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        Py_UCS1 *tail = bytes + old_length * char_size;
        memset(tail, 0xff, (new_length - old_length) * char_size);
    }
}
#endif
653
Victor Stinnerfe226c02011-10-03 03:52:20 +0200654static PyObject*
655resize_compact(PyObject *unicode, Py_ssize_t length)
656{
657 Py_ssize_t char_size;
658 Py_ssize_t struct_size;
659 Py_ssize_t new_size;
660 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100661 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200662#ifdef Py_DEBUG
663 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
664#endif
665
Victor Stinner79891572012-05-03 13:43:07 +0200666 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200667 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100668 assert(PyUnicode_IS_COMPACT(unicode));
669
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200670 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100671 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200672 struct_size = sizeof(PyASCIIObject);
673 else
674 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200675 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676
Victor Stinnerfe226c02011-10-03 03:52:20 +0200677 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
678 PyErr_NoMemory();
679 return NULL;
680 }
681 new_size = (struct_size + (length + 1) * char_size);
682
Victor Stinner84def372011-12-11 20:04:56 +0100683 _Py_DEC_REFTOTAL;
684 _Py_ForgetReference(unicode);
685
686 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
687 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100688 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200689 PyErr_NoMemory();
690 return NULL;
691 }
Victor Stinner84def372011-12-11 20:04:56 +0100692 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200696 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100698 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200699 _PyUnicode_WSTR_LENGTH(unicode) = length;
700 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200701#ifdef Py_DEBUG
702 unicode_fill_invalid(unicode, old_length);
703#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
705 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200706 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200707 return unicode;
708}
709
Alexander Belopolsky40018472011-02-26 01:02:56 +0000710static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200711resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712{
Victor Stinner95663112011-10-04 01:03:50 +0200713 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100714 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 if (PyUnicode_IS_READY(unicode)) {
719 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200720 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200722#ifdef Py_DEBUG
723 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
724#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725
726 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200727 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200728 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
729 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730
731 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
732 PyErr_NoMemory();
733 return -1;
734 }
735 new_size = (length + 1) * char_size;
736
Victor Stinner7a9105a2011-12-12 00:13:42 +0100737 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
738 {
739 PyObject_DEL(_PyUnicode_UTF8(unicode));
740 _PyUnicode_UTF8(unicode) = NULL;
741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
742 }
743
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744 data = (PyObject *)PyObject_REALLOC(data, new_size);
745 if (data == NULL) {
746 PyErr_NoMemory();
747 return -1;
748 }
749 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200750 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200751 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200752 _PyUnicode_WSTR_LENGTH(unicode) = length;
753 }
754 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200755 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200756 _PyUnicode_UTF8_LENGTH(unicode) = length;
757 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200758 _PyUnicode_LENGTH(unicode) = length;
759 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200760#ifdef Py_DEBUG
761 unicode_fill_invalid(unicode, old_length);
762#endif
Victor Stinner95663112011-10-04 01:03:50 +0200763 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200764 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 }
Victor Stinner95663112011-10-04 01:03:50 +0200768 assert(_PyUnicode_WSTR(unicode) != NULL);
769
770 /* check for integer overflow */
771 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
772 PyErr_NoMemory();
773 return -1;
774 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100775 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200776 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100777 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200778 if (!wstr) {
779 PyErr_NoMemory();
780 return -1;
781 }
782 _PyUnicode_WSTR(unicode) = wstr;
783 _PyUnicode_WSTR(unicode)[length] = 0;
784 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200785 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000786 return 0;
787}
788
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789static PyObject*
790resize_copy(PyObject *unicode, Py_ssize_t length)
791{
792 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100793 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100795
Benjamin Petersonbac79492012-01-14 13:34:47 -0500796 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100797 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200798
799 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
800 if (copy == NULL)
801 return NULL;
802
803 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200804 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200805 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200806 }
807 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200808 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100809
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200810 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 if (w == NULL)
812 return NULL;
813 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
814 copy_length = Py_MIN(copy_length, length);
815 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
816 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200817 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200818 }
819}
820
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000822 Ux0000 terminated; some code (e.g. new_identifier)
823 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824
825 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000827
828*/
829
Alexander Belopolsky40018472011-02-26 01:02:56 +0000830static PyUnicodeObject *
831_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832{
833 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200834 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835
Thomas Wouters477c8d52006-05-27 19:21:47 +0000836 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000837 if (length == 0 && unicode_empty != NULL) {
838 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200839 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840 }
841
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000842 /* Ensure we won't overflow the size. */
843 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
844 return (PyUnicodeObject *)PyErr_NoMemory();
845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 if (length < 0) {
847 PyErr_SetString(PyExc_SystemError,
848 "Negative size passed to _PyUnicode_New");
849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000850 }
851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
853 if (unicode == NULL)
854 return NULL;
855 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
856 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
857 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100858 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000859 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100860 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200862
Jeremy Hyltond8082792003-09-16 19:41:39 +0000863 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000864 * the caller fails before initializing str -- unicode_resize()
865 * reads str[0], and the Keep-Alive optimization can keep memory
866 * allocated for str alive across a call to unicode_dealloc(unicode).
867 * We don't want unicode_resize to read uninitialized memory in
868 * that case.
869 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870 _PyUnicode_WSTR(unicode)[0] = 0;
871 _PyUnicode_WSTR(unicode)[length] = 0;
872 _PyUnicode_WSTR_LENGTH(unicode) = length;
873 _PyUnicode_HASH(unicode) = -1;
874 _PyUnicode_STATE(unicode).interned = 0;
875 _PyUnicode_STATE(unicode).kind = 0;
876 _PyUnicode_STATE(unicode).compact = 0;
877 _PyUnicode_STATE(unicode).ready = 0;
878 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200879 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200881 _PyUnicode_UTF8(unicode) = NULL;
882 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100883 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 return unicode;
885}
886
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887static const char*
888unicode_kind_name(PyObject *unicode)
889{
Victor Stinner42dfd712011-10-03 14:41:45 +0200890 /* don't check consistency: unicode_kind_name() is called from
891 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 if (!PyUnicode_IS_COMPACT(unicode))
893 {
894 if (!PyUnicode_IS_READY(unicode))
895 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600896 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 {
898 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200899 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 return "legacy ascii";
901 else
902 return "legacy latin1";
903 case PyUnicode_2BYTE_KIND:
904 return "legacy UCS2";
905 case PyUnicode_4BYTE_KIND:
906 return "legacy UCS4";
907 default:
908 return "<legacy invalid kind>";
909 }
910 }
911 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600912 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200913 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200914 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200915 return "ascii";
916 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200917 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200918 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200919 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200920 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200921 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200922 default:
923 return "<invalid compact kind>";
924 }
925}
926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
932
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
936void *_PyUnicode_data(void *unicode){
937 printf("obj %p\n", unicode);
938 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
939 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
940 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
941 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
942 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
943 return PyUnicode_DATA(unicode);
944}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200945
946void
947_PyUnicode_Dump(PyObject *op)
948{
949 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200950 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
951 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
952 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200953
Victor Stinnera849a4b2011-10-03 12:12:11 +0200954 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200955 {
956 if (ascii->state.ascii)
957 data = (ascii + 1);
958 else
959 data = (compact + 1);
960 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200961 else
962 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200963 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
964
Victor Stinnera849a4b2011-10-03 12:12:11 +0200965 if (ascii->wstr == data)
966 printf("shared ");
967 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200968
Victor Stinnera3b334d2011-10-03 13:53:37 +0200969 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200970 printf(" (%zu), ", compact->wstr_length);
971 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
972 printf("shared ");
973 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200974 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200975 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200976}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977#endif
978
979PyObject *
980PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
981{
982 PyObject *obj;
983 PyCompactUnicodeObject *unicode;
984 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200985 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200986 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 Py_ssize_t char_size;
988 Py_ssize_t struct_size;
989
990 /* Optimization for empty strings */
991 if (size == 0 && unicode_empty != NULL) {
992 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200993 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200994 }
995
Victor Stinner9e9d6892011-10-04 01:02:02 +0200996 is_ascii = 0;
997 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200998 struct_size = sizeof(PyCompactUnicodeObject);
999 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001000 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001 char_size = 1;
1002 is_ascii = 1;
1003 struct_size = sizeof(PyASCIIObject);
1004 }
1005 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001006 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001007 char_size = 1;
1008 }
1009 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001010 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001011 char_size = 2;
1012 if (sizeof(wchar_t) == 2)
1013 is_sharing = 1;
1014 }
1015 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001016 if (maxchar > MAX_UNICODE) {
1017 PyErr_SetString(PyExc_SystemError,
1018 "invalid maximum character passed to PyUnicode_New");
1019 return NULL;
1020 }
Victor Stinner8f825062012-04-27 13:55:39 +02001021 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001022 char_size = 4;
1023 if (sizeof(wchar_t) == 4)
1024 is_sharing = 1;
1025 }
1026
1027 /* Ensure we won't overflow the size. */
1028 if (size < 0) {
1029 PyErr_SetString(PyExc_SystemError,
1030 "Negative size passed to PyUnicode_New");
1031 return NULL;
1032 }
1033 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1034 return PyErr_NoMemory();
1035
1036 /* Duplicated allocation code from _PyObject_New() instead of a call to
1037 * PyObject_New() so we are able to allocate space for the object and
1038 * it's data buffer.
1039 */
1040 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1041 if (obj == NULL)
1042 return PyErr_NoMemory();
1043 obj = PyObject_INIT(obj, &PyUnicode_Type);
1044 if (obj == NULL)
1045 return NULL;
1046
1047 unicode = (PyCompactUnicodeObject *)obj;
1048 if (is_ascii)
1049 data = ((PyASCIIObject*)obj) + 1;
1050 else
1051 data = unicode + 1;
1052 _PyUnicode_LENGTH(unicode) = size;
1053 _PyUnicode_HASH(unicode) = -1;
1054 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001055 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 _PyUnicode_STATE(unicode).compact = 1;
1057 _PyUnicode_STATE(unicode).ready = 1;
1058 _PyUnicode_STATE(unicode).ascii = is_ascii;
1059 if (is_ascii) {
1060 ((char*)data)[size] = 0;
1061 _PyUnicode_WSTR(unicode) = NULL;
1062 }
Victor Stinner8f825062012-04-27 13:55:39 +02001063 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 ((char*)data)[size] = 0;
1065 _PyUnicode_WSTR(unicode) = NULL;
1066 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001068 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 else {
1071 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001072 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001073 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001075 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001076 ((Py_UCS4*)data)[size] = 0;
1077 if (is_sharing) {
1078 _PyUnicode_WSTR_LENGTH(unicode) = size;
1079 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1080 }
1081 else {
1082 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1083 _PyUnicode_WSTR(unicode) = NULL;
1084 }
1085 }
Victor Stinner8f825062012-04-27 13:55:39 +02001086#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001087 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001088#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001089 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 return obj;
1091}
1092
1093#if SIZEOF_WCHAR_T == 2
1094/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1095 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001096 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097
1098 This function assumes that unicode can hold one more code point than wstr
1099 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001100static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103{
1104 const wchar_t *iter;
1105 Py_UCS4 *ucs4_out;
1106
Victor Stinner910337b2011-10-03 03:20:16 +02001107 assert(unicode != NULL);
1108 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1110 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1111
1112 for (iter = begin; iter < end; ) {
1113 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1114 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001115 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1116 && (iter+1) < end
1117 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 {
Victor Stinner551ac952011-11-29 22:58:13 +01001119 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 iter += 2;
1121 }
1122 else {
1123 *ucs4_out++ = *iter;
1124 iter++;
1125 }
1126 }
1127 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1128 _PyUnicode_GET_LENGTH(unicode)));
1129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130}
1131#endif
1132
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133static int
Victor Stinner488fa492011-12-12 00:01:39 +01001134unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001135{
Victor Stinner488fa492011-12-12 00:01:39 +01001136 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001137 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001138 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return -1;
1140 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141 return 0;
1142}
1143
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144static int
1145_copy_characters(PyObject *to, Py_ssize_t to_start,
1146 PyObject *from, Py_ssize_t from_start,
1147 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 unsigned int from_kind, to_kind;
1150 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151
Victor Stinneree4544c2012-05-09 22:24:08 +02001152 assert(0 <= how_many);
1153 assert(0 <= from_start);
1154 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001155 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001157 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158
Victor Stinnerd3f08822012-05-29 12:57:52 +02001159 assert(PyUnicode_Check(to));
1160 assert(PyUnicode_IS_READY(to));
1161 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1162
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001163 if (how_many == 0)
1164 return 0;
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001167 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170
Victor Stinnerf1852262012-06-16 16:38:26 +02001171#ifdef Py_DEBUG
1172 if (!check_maxchar
1173 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1174 {
1175 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1176 Py_UCS4 ch;
1177 Py_ssize_t i;
1178 for (i=0; i < how_many; i++) {
1179 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1180 assert(ch <= to_maxchar);
1181 }
1182 }
1183#endif
1184
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001185 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001186 if (check_maxchar
1187 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1188 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001189 /* Writing Latin-1 characters into an ASCII string requires to
1190 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001191 Py_UCS4 max_char;
1192 max_char = ucs1lib_find_max_char(from_data,
1193 (Py_UCS1*)from_data + how_many);
1194 if (max_char >= 128)
1195 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001196 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001197 Py_MEMCPY((char*)to_data + to_kind * to_start,
1198 (char*)from_data + from_kind * from_start,
1199 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001201 else if (from_kind == PyUnicode_1BYTE_KIND
1202 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 {
1204 _PyUnicode_CONVERT_BYTES(
1205 Py_UCS1, Py_UCS2,
1206 PyUnicode_1BYTE_DATA(from) + from_start,
1207 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1208 PyUnicode_2BYTE_DATA(to) + to_start
1209 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001210 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001211 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001212 && to_kind == PyUnicode_4BYTE_KIND)
1213 {
1214 _PyUnicode_CONVERT_BYTES(
1215 Py_UCS1, Py_UCS4,
1216 PyUnicode_1BYTE_DATA(from) + from_start,
1217 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1218 PyUnicode_4BYTE_DATA(to) + to_start
1219 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001220 }
1221 else if (from_kind == PyUnicode_2BYTE_KIND
1222 && to_kind == PyUnicode_4BYTE_KIND)
1223 {
1224 _PyUnicode_CONVERT_BYTES(
1225 Py_UCS2, Py_UCS4,
1226 PyUnicode_2BYTE_DATA(from) + from_start,
1227 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1228 PyUnicode_4BYTE_DATA(to) + to_start
1229 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001230 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001231 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001232 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1233
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001234 if (!check_maxchar) {
1235 if (from_kind == PyUnicode_2BYTE_KIND
1236 && to_kind == PyUnicode_1BYTE_KIND)
1237 {
1238 _PyUnicode_CONVERT_BYTES(
1239 Py_UCS2, Py_UCS1,
1240 PyUnicode_2BYTE_DATA(from) + from_start,
1241 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1242 PyUnicode_1BYTE_DATA(to) + to_start
1243 );
1244 }
1245 else if (from_kind == PyUnicode_4BYTE_KIND
1246 && to_kind == PyUnicode_1BYTE_KIND)
1247 {
1248 _PyUnicode_CONVERT_BYTES(
1249 Py_UCS4, Py_UCS1,
1250 PyUnicode_4BYTE_DATA(from) + from_start,
1251 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1252 PyUnicode_1BYTE_DATA(to) + to_start
1253 );
1254 }
1255 else if (from_kind == PyUnicode_4BYTE_KIND
1256 && to_kind == PyUnicode_2BYTE_KIND)
1257 {
1258 _PyUnicode_CONVERT_BYTES(
1259 Py_UCS4, Py_UCS2,
1260 PyUnicode_4BYTE_DATA(from) + from_start,
1261 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1262 PyUnicode_2BYTE_DATA(to) + to_start
1263 );
1264 }
1265 else {
1266 assert(0);
1267 return -1;
1268 }
1269 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001270 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001271 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001272 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001273 Py_ssize_t i;
1274
Victor Stinnera0702ab2011-09-29 14:14:38 +02001275 for (i=0; i < how_many; i++) {
1276 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001277 if (ch > to_maxchar)
1278 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001279 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1280 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001281 }
1282 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001283 return 0;
1284}
1285
Victor Stinnerd3f08822012-05-29 12:57:52 +02001286void
1287_PyUnicode_FastCopyCharacters(
1288 PyObject *to, Py_ssize_t to_start,
1289 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001290{
1291 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1292}
1293
1294Py_ssize_t
1295PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1296 PyObject *from, Py_ssize_t from_start,
1297 Py_ssize_t how_many)
1298{
1299 int err;
1300
1301 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1302 PyErr_BadInternalCall();
1303 return -1;
1304 }
1305
Benjamin Petersonbac79492012-01-14 13:34:47 -05001306 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001307 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001308 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001309 return -1;
1310
Victor Stinnerd3f08822012-05-29 12:57:52 +02001311 if (from_start < 0) {
1312 PyErr_SetString(PyExc_IndexError, "string index out of range");
1313 return -1;
1314 }
1315 if (to_start < 0) {
1316 PyErr_SetString(PyExc_IndexError, "string index out of range");
1317 return -1;
1318 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001319 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1320 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1321 PyErr_Format(PyExc_SystemError,
1322 "Cannot write %zi characters at %zi "
1323 "in a string of %zi characters",
1324 how_many, to_start, PyUnicode_GET_LENGTH(to));
1325 return -1;
1326 }
1327
1328 if (how_many == 0)
1329 return 0;
1330
Victor Stinner488fa492011-12-12 00:01:39 +01001331 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001332 return -1;
1333
1334 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1335 if (err) {
1336 PyErr_Format(PyExc_SystemError,
1337 "Cannot copy %s characters "
1338 "into a string of %s characters",
1339 unicode_kind_name(from),
1340 unicode_kind_name(to));
1341 return -1;
1342 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001343 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344}
1345
Victor Stinner17222162011-09-28 22:15:37 +02001346/* Find the maximum code point and count the number of surrogate pairs so a
1347 correct string length can be computed before converting a string to UCS4.
1348 This function counts single surrogates as a character and not as a pair.
1349
1350 Return 0 on success, or -1 on error. */
1351static int
1352find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1353 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001354{
1355 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001356 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357
Victor Stinnerc53be962011-10-02 21:33:54 +02001358 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001359 *num_surrogates = 0;
1360 *maxchar = 0;
1361
1362 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001364 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1365 && (iter+1) < end
1366 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001368 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 iter += 2;
1371 }
1372 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001374 {
1375 ch = *iter;
1376 iter++;
1377 }
1378 if (ch > *maxchar) {
1379 *maxchar = ch;
1380 if (*maxchar > MAX_UNICODE) {
1381 PyErr_Format(PyExc_ValueError,
1382 "character U+%x is not in range [U+0000; U+10ffff]",
1383 ch);
1384 return -1;
1385 }
1386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 }
1388 return 0;
1389}
1390
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001391int
1392_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393{
1394 wchar_t *end;
1395 Py_UCS4 maxchar = 0;
1396 Py_ssize_t num_surrogates;
1397#if SIZEOF_WCHAR_T == 2
1398 Py_ssize_t length_wo_surrogates;
1399#endif
1400
Georg Brandl7597add2011-10-05 16:36:47 +02001401 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001402 strings were created using _PyObject_New() and where no canonical
1403 representation (the str field) has been set yet aka strings
1404 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001405 assert(_PyUnicode_CHECK(unicode));
1406 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001408 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001409 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001410 /* Actually, it should neither be interned nor be anything else: */
1411 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001414 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001415 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417
1418 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001419 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1420 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 PyErr_NoMemory();
1422 return -1;
1423 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001424 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 _PyUnicode_WSTR(unicode), end,
1426 PyUnicode_1BYTE_DATA(unicode));
1427 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1428 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1429 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1430 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001431 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001432 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001433 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 }
1435 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001436 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001437 _PyUnicode_UTF8(unicode) = NULL;
1438 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 }
1440 PyObject_FREE(_PyUnicode_WSTR(unicode));
1441 _PyUnicode_WSTR(unicode) = NULL;
1442 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1443 }
1444 /* In this case we might have to convert down from 4-byte native
1445 wchar_t to 2-byte unicode. */
1446 else if (maxchar < 65536) {
1447 assert(num_surrogates == 0 &&
1448 "FindMaxCharAndNumSurrogatePairs() messed up");
1449
Victor Stinner506f5922011-09-28 22:34:18 +02001450#if SIZEOF_WCHAR_T == 2
1451 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001452 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001453 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1454 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1455 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001456 _PyUnicode_UTF8(unicode) = NULL;
1457 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001458#else
1459 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001460 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001461 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001462 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001463 PyErr_NoMemory();
1464 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 }
Victor Stinner506f5922011-09-28 22:34:18 +02001466 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1467 _PyUnicode_WSTR(unicode), end,
1468 PyUnicode_2BYTE_DATA(unicode));
1469 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1470 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1471 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001472 _PyUnicode_UTF8(unicode) = NULL;
1473 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001474 PyObject_FREE(_PyUnicode_WSTR(unicode));
1475 _PyUnicode_WSTR(unicode) = NULL;
1476 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1477#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 }
1479 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1480 else {
1481#if SIZEOF_WCHAR_T == 2
1482 /* in case the native representation is 2-bytes, we need to allocate a
1483 new normalized 4-byte version. */
1484 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001485 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1486 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 PyErr_NoMemory();
1488 return -1;
1489 }
1490 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1491 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001492 _PyUnicode_UTF8(unicode) = NULL;
1493 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001494 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1495 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001496 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 PyObject_FREE(_PyUnicode_WSTR(unicode));
1498 _PyUnicode_WSTR(unicode) = NULL;
1499 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1500#else
1501 assert(num_surrogates == 0);
1502
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001505 _PyUnicode_UTF8(unicode) = NULL;
1506 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1508#endif
1509 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1510 }
1511 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001512 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001513 return 0;
1514}
1515
Alexander Belopolsky40018472011-02-26 01:02:56 +00001516static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001517unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518{
Walter Dörwald16807132007-05-25 13:52:07 +00001519 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001520 case SSTATE_NOT_INTERNED:
1521 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001522
Benjamin Peterson29060642009-01-31 22:14:21 +00001523 case SSTATE_INTERNED_MORTAL:
1524 /* revive dead object temporarily for DelItem */
1525 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001526 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001527 Py_FatalError(
1528 "deletion of interned string failed");
1529 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 case SSTATE_INTERNED_IMMORTAL:
1532 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001533
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 default:
1535 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001536 }
1537
Victor Stinner03490912011-10-03 23:45:12 +02001538 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001539 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001540 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001541 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001542 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1543 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001545 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001546}
1547
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only a string of exactly one character with a canonical
       representation can be a Latin-1 singleton. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1564
Alexander Belopolsky40018472011-02-26 01:02:56 +00001565static int
Victor Stinner488fa492011-12-12 00:01:39 +01001566unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001567{
Victor Stinner488fa492011-12-12 00:01:39 +01001568 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001569 if (Py_REFCNT(unicode) != 1)
1570 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (_PyUnicode_HASH(unicode) != -1)
1572 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001573 if (PyUnicode_CHECK_INTERNED(unicode))
1574 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001575 if (!PyUnicode_CheckExact(unicode))
1576 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001577#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001578 /* singleton refcount is greater than 1 */
1579 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001580#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 return 1;
1582}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584static int
1585unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1586{
1587 PyObject *unicode;
1588 Py_ssize_t old_length;
1589
1590 assert(p_unicode != NULL);
1591 unicode = *p_unicode;
1592
1593 assert(unicode != NULL);
1594 assert(PyUnicode_Check(unicode));
1595 assert(0 <= length);
1596
Victor Stinner910337b2011-10-03 03:20:16 +02001597 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001598 old_length = PyUnicode_WSTR_LENGTH(unicode);
1599 else
1600 old_length = PyUnicode_GET_LENGTH(unicode);
1601 if (old_length == length)
1602 return 0;
1603
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001604 if (length == 0) {
1605 Py_DECREF(*p_unicode);
1606 *p_unicode = unicode_empty;
1607 Py_INCREF(*p_unicode);
1608 return 0;
1609 }
1610
Victor Stinner488fa492011-12-12 00:01:39 +01001611 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001612 PyObject *copy = resize_copy(unicode, length);
1613 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001614 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 Py_DECREF(*p_unicode);
1616 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618 }
1619
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001621 PyObject *new_unicode = resize_compact(unicode, length);
1622 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001624 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001625 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001626 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001627 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001628}
1629
Alexander Belopolsky40018472011-02-26 01:02:56 +00001630int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 PyObject *unicode;
1634 if (p_unicode == NULL) {
1635 PyErr_BadInternalCall();
1636 return -1;
1637 }
1638 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001639 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001640 {
1641 PyErr_BadInternalCall();
1642 return -1;
1643 }
1644 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001645}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001646
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001648unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1649 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001650{
1651 PyObject *result;
1652 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001653 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001654 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1655 return 0;
1656 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1657 maxchar);
1658 if (result == NULL)
1659 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001660 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 Py_DECREF(*p_unicode);
1662 *p_unicode = result;
1663 return 0;
1664}
1665
1666static int
1667unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1668 Py_UCS4 ch)
1669{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001670 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001671 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001672 return -1;
1673 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1674 PyUnicode_DATA(*p_unicode),
1675 (*pos)++, ch);
1676 return 0;
1677}
1678
Victor Stinnerc5166102012-02-22 13:55:02 +01001679/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001680
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001681 WARNING: The function doesn't copy the terminating null character and
1682 doesn't check the maximum character (may write a latin1 character in an
1683 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001684static void
1685unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1686 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001687{
1688 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1689 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001690 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001691
1692 switch (kind) {
1693 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001694 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001695#ifdef Py_DEBUG
1696 if (PyUnicode_IS_ASCII(unicode)) {
1697 Py_UCS4 maxchar = ucs1lib_find_max_char(
1698 (const Py_UCS1*)str,
1699 (const Py_UCS1*)str + len);
1700 assert(maxchar < 128);
1701 }
1702#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001703 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001704 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001705 }
1706 case PyUnicode_2BYTE_KIND: {
1707 Py_UCS2 *start = (Py_UCS2 *)data + index;
1708 Py_UCS2 *ucs2 = start;
1709 assert(index <= PyUnicode_GET_LENGTH(unicode));
1710
Victor Stinner184252a2012-06-16 02:57:41 +02001711 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001712 *ucs2 = (Py_UCS2)*str;
1713
1714 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001715 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001716 }
1717 default: {
1718 Py_UCS4 *start = (Py_UCS4 *)data + index;
1719 Py_UCS4 *ucs4 = start;
1720 assert(kind == PyUnicode_4BYTE_KIND);
1721 assert(index <= PyUnicode_GET_LENGTH(unicode));
1722
Victor Stinner184252a2012-06-16 02:57:41 +02001723 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001724 *ucs4 = (Py_UCS4)*str;
1725
1726 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001727 }
1728 }
1729}
1730
1731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001732static PyObject*
1733get_latin1_char(unsigned char ch)
1734{
Victor Stinnera464fc12011-10-02 20:39:30 +02001735 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001737 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 if (!unicode)
1739 return NULL;
1740 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001741 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742 unicode_latin1[ch] = unicode;
1743 }
1744 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001745 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746}
1747
Alexander Belopolsky40018472011-02-26 01:02:56 +00001748PyObject *
1749PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001751 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 Py_UCS4 maxchar = 0;
1753 Py_ssize_t num_surrogates;
1754
1755 if (u == NULL)
1756 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001758 /* If the Unicode data is known at construction time, we can apply
1759 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 /* Optimization for empty strings */
1762 if (size == 0 && unicode_empty != NULL) {
1763 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001764 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001765 }
Tim Petersced69f82003-09-16 20:30:58 +00001766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 /* Single character Unicode objects in the Latin-1 range are
1768 shared when using this constructor */
1769 if (size == 1 && *u < 256)
1770 return get_latin1_char((unsigned char)*u);
1771
1772 /* If not empty and not single character, copy the Unicode data
1773 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001774 if (find_maxchar_surrogates(u, u + size,
1775 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 return NULL;
1777
Victor Stinner8faf8212011-12-08 22:14:11 +01001778 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 if (!unicode)
1780 return NULL;
1781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 switch (PyUnicode_KIND(unicode)) {
1783 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001784 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1786 break;
1787 case PyUnicode_2BYTE_KIND:
1788#if Py_UNICODE_SIZE == 2
1789 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1790#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001791 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1793#endif
1794 break;
1795 case PyUnicode_4BYTE_KIND:
1796#if SIZEOF_WCHAR_T == 2
1797 /* This is the only case which has to process surrogates, thus
1798 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001799 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800#else
1801 assert(num_surrogates == 0);
1802 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1803#endif
1804 break;
1805 default:
1806 assert(0 && "Impossible state");
1807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001809 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810}
1811
Alexander Belopolsky40018472011-02-26 01:02:56 +00001812PyObject *
1813PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001814{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001815 if (size < 0) {
1816 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001817 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001818 return NULL;
1819 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001820 if (u != NULL)
1821 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1822 else
1823 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001824}
1825
Alexander Belopolsky40018472011-02-26 01:02:56 +00001826PyObject *
1827PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001828{
1829 size_t size = strlen(u);
1830 if (size > PY_SSIZE_T_MAX) {
1831 PyErr_SetString(PyExc_OverflowError, "input too long");
1832 return NULL;
1833 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001834 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001835}
1836
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001837PyObject *
1838_PyUnicode_FromId(_Py_Identifier *id)
1839{
1840 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001841 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1842 strlen(id->string),
1843 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001844 if (!id->object)
1845 return NULL;
1846 PyUnicode_InternInPlace(&id->object);
1847 assert(!id->next);
1848 id->next = static_strings;
1849 static_strings = id;
1850 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001851 return id->object;
1852}
1853
1854void
1855_PyUnicode_ClearStaticStrings()
1856{
1857 _Py_Identifier *i;
1858 for (i = static_strings; i; i = i->next) {
1859 Py_DECREF(i->object);
1860 i->object = NULL;
1861 i->next = NULL;
1862 }
1863}
1864
Benjamin Peterson0df54292012-03-26 14:50:32 -04001865/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001866
Victor Stinnerd3f08822012-05-29 12:57:52 +02001867PyObject*
1868_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001869{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001870 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001871 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001872 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001873#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001874 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001875#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001876 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001877 }
Victor Stinner785938e2011-12-11 20:09:03 +01001878 unicode = PyUnicode_New(size, 127);
1879 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001880 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001881 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1882 assert(_PyUnicode_CheckConsistency(unicode, 1));
1883 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001884}
1885
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001886static Py_UCS4
1887kind_maxchar_limit(unsigned int kind)
1888{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001889 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001890 case PyUnicode_1BYTE_KIND:
1891 return 0x80;
1892 case PyUnicode_2BYTE_KIND:
1893 return 0x100;
1894 case PyUnicode_4BYTE_KIND:
1895 return 0x10000;
1896 default:
1897 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001898 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001899 }
1900}
1901
Victor Stinnere6abb482012-05-02 01:15:40 +02001902Py_LOCAL_INLINE(Py_UCS4)
1903align_maxchar(Py_UCS4 maxchar)
1904{
1905 if (maxchar <= 127)
1906 return 127;
1907 else if (maxchar <= 255)
1908 return 255;
1909 else if (maxchar <= 65535)
1910 return 65535;
1911 else
1912 return MAX_UNICODE;
1913}
1914
Victor Stinner702c7342011-10-05 13:50:52 +02001915static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001916_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001917{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001919 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001920
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001921 if (size == 0) {
1922 Py_INCREF(unicode_empty);
1923 return unicode_empty;
1924 }
1925 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001926 if (size == 1)
1927 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001928
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001929 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001930 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001931 if (!res)
1932 return NULL;
1933 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001934 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001936}
1937
Victor Stinnere57b1c02011-09-28 22:20:48 +02001938static PyObject*
1939_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940{
1941 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001942 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001943
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001944 if (size == 0) {
1945 Py_INCREF(unicode_empty);
1946 return unicode_empty;
1947 }
1948 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001949 if (size == 1) {
1950 Py_UCS4 ch = u[0];
1951 if (ch < 256)
1952 return get_latin1_char((unsigned char)ch);
1953
1954 res = PyUnicode_New(1, ch);
1955 if (res == NULL)
1956 return NULL;
1957 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1958 assert(_PyUnicode_CheckConsistency(res, 1));
1959 return res;
1960 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001961
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001962 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001963 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 if (!res)
1965 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001966 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001968 else {
1969 _PyUnicode_CONVERT_BYTES(
1970 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1971 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001972 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 return res;
1974}
1975
Victor Stinnere57b1c02011-09-28 22:20:48 +02001976static PyObject*
1977_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978{
1979 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001980 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001981
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001982 if (size == 0) {
1983 Py_INCREF(unicode_empty);
1984 return unicode_empty;
1985 }
1986 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001987 if (size == 1) {
1988 Py_UCS4 ch = u[0];
1989 if (ch < 256)
1990 return get_latin1_char((unsigned char)ch);
1991
1992 res = PyUnicode_New(1, ch);
1993 if (res == NULL)
1994 return NULL;
1995 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1996 assert(_PyUnicode_CheckConsistency(res, 1));
1997 return res;
1998 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001999
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002000 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002001 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 if (!res)
2003 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002004 if (max_char < 256)
2005 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2006 PyUnicode_1BYTE_DATA(res));
2007 else if (max_char < 0x10000)
2008 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2009 PyUnicode_2BYTE_DATA(res));
2010 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002012 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 return res;
2014}
2015
2016PyObject*
2017PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2018{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002019 if (size < 0) {
2020 PyErr_SetString(PyExc_ValueError, "size must be positive");
2021 return NULL;
2022 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002023 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002025 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002027 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002029 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002030 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002031 PyErr_SetString(PyExc_SystemError, "invalid kind");
2032 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034}
2035
Victor Stinnerece58de2012-04-23 23:36:38 +02002036Py_UCS4
2037_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2038{
2039 enum PyUnicode_Kind kind;
2040 void *startptr, *endptr;
2041
2042 assert(PyUnicode_IS_READY(unicode));
2043 assert(0 <= start);
2044 assert(end <= PyUnicode_GET_LENGTH(unicode));
2045 assert(start <= end);
2046
2047 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2048 return PyUnicode_MAX_CHAR_VALUE(unicode);
2049
2050 if (start == end)
2051 return 127;
2052
Victor Stinner94d558b2012-04-27 22:26:58 +02002053 if (PyUnicode_IS_ASCII(unicode))
2054 return 127;
2055
Victor Stinnerece58de2012-04-23 23:36:38 +02002056 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002057 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002058 endptr = (char *)startptr + end * kind;
2059 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002060 switch(kind) {
2061 case PyUnicode_1BYTE_KIND:
2062 return ucs1lib_find_max_char(startptr, endptr);
2063 case PyUnicode_2BYTE_KIND:
2064 return ucs2lib_find_max_char(startptr, endptr);
2065 case PyUnicode_4BYTE_KIND:
2066 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002067 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002068 assert(0);
2069 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002070 }
2071}
2072
Victor Stinner25a4b292011-10-06 12:31:55 +02002073/* Ensure that a string uses the most efficient storage, if it is not the
2074 case: create a new string with of the right kind. Write NULL into *p_unicode
2075 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002076static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002077unicode_adjust_maxchar(PyObject **p_unicode)
2078{
2079 PyObject *unicode, *copy;
2080 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002081 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002082 unsigned int kind;
2083
2084 assert(p_unicode != NULL);
2085 unicode = *p_unicode;
2086 assert(PyUnicode_IS_READY(unicode));
2087 if (PyUnicode_IS_ASCII(unicode))
2088 return;
2089
2090 len = PyUnicode_GET_LENGTH(unicode);
2091 kind = PyUnicode_KIND(unicode);
2092 if (kind == PyUnicode_1BYTE_KIND) {
2093 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002094 max_char = ucs1lib_find_max_char(u, u + len);
2095 if (max_char >= 128)
2096 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002097 }
2098 else if (kind == PyUnicode_2BYTE_KIND) {
2099 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002100 max_char = ucs2lib_find_max_char(u, u + len);
2101 if (max_char >= 256)
2102 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002103 }
2104 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002105 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002106 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002107 max_char = ucs4lib_find_max_char(u, u + len);
2108 if (max_char >= 0x10000)
2109 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002110 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002111 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002112 if (copy != NULL)
2113 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002114 Py_DECREF(unicode);
2115 *p_unicode = copy;
2116}
2117
Victor Stinner034f6cf2011-09-30 02:26:44 +02002118PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002119_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002120{
Victor Stinner87af4f22011-11-21 23:03:47 +01002121 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002122 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002123
Victor Stinner034f6cf2011-09-30 02:26:44 +02002124 if (!PyUnicode_Check(unicode)) {
2125 PyErr_BadInternalCall();
2126 return NULL;
2127 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002128 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002129 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002130
Victor Stinner87af4f22011-11-21 23:03:47 +01002131 length = PyUnicode_GET_LENGTH(unicode);
2132 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002133 if (!copy)
2134 return NULL;
2135 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2136
Victor Stinner87af4f22011-11-21 23:03:47 +01002137 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2138 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002139 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002140 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002141}
2142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002143
Victor Stinnerbc603d12011-10-02 01:00:40 +02002144/* Widen Unicode objects to larger buffers. Don't write terminating null
2145 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146
2147void*
2148_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2149{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002150 Py_ssize_t len;
2151 void *result;
2152 unsigned int skind;
2153
Benjamin Petersonbac79492012-01-14 13:34:47 -05002154 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002155 return NULL;
2156
2157 len = PyUnicode_GET_LENGTH(s);
2158 skind = PyUnicode_KIND(s);
2159 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002160 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002161 return NULL;
2162 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002163 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002164 case PyUnicode_2BYTE_KIND:
2165 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2166 if (!result)
2167 return PyErr_NoMemory();
2168 assert(skind == PyUnicode_1BYTE_KIND);
2169 _PyUnicode_CONVERT_BYTES(
2170 Py_UCS1, Py_UCS2,
2171 PyUnicode_1BYTE_DATA(s),
2172 PyUnicode_1BYTE_DATA(s) + len,
2173 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002175 case PyUnicode_4BYTE_KIND:
2176 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2177 if (!result)
2178 return PyErr_NoMemory();
2179 if (skind == PyUnicode_2BYTE_KIND) {
2180 _PyUnicode_CONVERT_BYTES(
2181 Py_UCS2, Py_UCS4,
2182 PyUnicode_2BYTE_DATA(s),
2183 PyUnicode_2BYTE_DATA(s) + len,
2184 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002186 else {
2187 assert(skind == PyUnicode_1BYTE_KIND);
2188 _PyUnicode_CONVERT_BYTES(
2189 Py_UCS1, Py_UCS4,
2190 PyUnicode_1BYTE_DATA(s),
2191 PyUnicode_1BYTE_DATA(s) + len,
2192 result);
2193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002195 default:
2196 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 }
Victor Stinner01698042011-10-04 00:04:26 +02002198 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 return NULL;
2200}
2201
2202static Py_UCS4*
2203as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2204 int copy_null)
2205{
2206 int kind;
2207 void *data;
2208 Py_ssize_t len, targetlen;
2209 if (PyUnicode_READY(string) == -1)
2210 return NULL;
2211 kind = PyUnicode_KIND(string);
2212 data = PyUnicode_DATA(string);
2213 len = PyUnicode_GET_LENGTH(string);
2214 targetlen = len;
2215 if (copy_null)
2216 targetlen++;
2217 if (!target) {
2218 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2219 PyErr_NoMemory();
2220 return NULL;
2221 }
2222 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2223 if (!target) {
2224 PyErr_NoMemory();
2225 return NULL;
2226 }
2227 }
2228 else {
2229 if (targetsize < targetlen) {
2230 PyErr_Format(PyExc_SystemError,
2231 "string is longer than the buffer");
2232 if (copy_null && 0 < targetsize)
2233 target[0] = 0;
2234 return NULL;
2235 }
2236 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002237 if (kind == PyUnicode_1BYTE_KIND) {
2238 Py_UCS1 *start = (Py_UCS1 *) data;
2239 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002241 else if (kind == PyUnicode_2BYTE_KIND) {
2242 Py_UCS2 *start = (Py_UCS2 *) data;
2243 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2244 }
2245 else {
2246 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 if (copy_null)
2250 target[len] = 0;
2251 return target;
2252}
2253
2254Py_UCS4*
2255PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2256 int copy_null)
2257{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002258 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 PyErr_BadInternalCall();
2260 return NULL;
2261 }
2262 return as_ucs4(string, target, targetsize, copy_null);
2263}
2264
2265Py_UCS4*
2266PyUnicode_AsUCS4Copy(PyObject *string)
2267{
2268 return as_ucs4(string, NULL, 0, 1);
2269}
2270
2271#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002272
Alexander Belopolsky40018472011-02-26 01:02:56 +00002273PyObject *
2274PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002275{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002277 if (size == 0) {
2278 Py_INCREF(unicode_empty);
2279 return unicode_empty;
2280 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002281 PyErr_BadInternalCall();
2282 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283 }
2284
Martin v. Löwis790465f2008-04-05 20:41:37 +00002285 if (size == -1) {
2286 size = wcslen(w);
2287 }
2288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290}
2291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002292#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002293
Walter Dörwald346737f2007-05-31 10:44:43 +00002294static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002295makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002296 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002297{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002298 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002299 if (longflag)
2300 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002301 else if (longlongflag) {
2302 /* longlongflag should only ever be nonzero on machines with
2303 HAVE_LONG_LONG defined */
2304#ifdef HAVE_LONG_LONG
2305 char *f = PY_FORMAT_LONG_LONG;
2306 while (*f)
2307 *fmt++ = *f++;
2308#else
2309 /* we shouldn't ever get here */
2310 assert(0);
2311 *fmt++ = 'l';
2312#endif
2313 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002314 else if (size_tflag) {
2315 char *f = PY_FORMAT_SIZE_T;
2316 while (*f)
2317 *fmt++ = *f++;
2318 }
2319 *fmt++ = c;
2320 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002321}
2322
Victor Stinner15a11362012-10-06 23:48:20 +02002323/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002324 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2325 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2326#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002327
2328static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002329unicode_fromformat_arg(_PyUnicodeWriter *writer,
2330 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002331{
Victor Stinnere215d962012-10-06 23:03:36 +02002332 const char *p;
2333 Py_ssize_t len;
2334 int zeropad;
2335 int width;
2336 int precision;
2337 int longflag;
2338 int longlongflag;
2339 int size_tflag;
2340 int fill;
2341
2342 p = f;
2343 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002344 zeropad = 0;
2345 if (*f == '0') {
2346 zeropad = 1;
2347 f++;
2348 }
Victor Stinner96865452011-03-01 23:44:09 +00002349
2350 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002351 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002352 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002353 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2354 PyErr_SetString(PyExc_ValueError,
2355 "width too big");
2356 return NULL;
2357 }
Victor Stinnere215d962012-10-06 23:03:36 +02002358 width = (width*10) + (*f - '0');
2359 f++;
2360 }
Victor Stinner96865452011-03-01 23:44:09 +00002361 precision = 0;
2362 if (*f == '.') {
2363 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002364 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002365 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2366 PyErr_SetString(PyExc_ValueError,
2367 "precision too big");
2368 return NULL;
2369 }
Victor Stinnere215d962012-10-06 23:03:36 +02002370 precision = (precision*10) + (*f - '0');
2371 f++;
2372 }
Victor Stinner96865452011-03-01 23:44:09 +00002373 if (*f == '%') {
2374 /* "%.3%s" => f points to "3" */
2375 f--;
2376 }
2377 }
2378 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002379 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002380 f--;
2381 }
Victor Stinner96865452011-03-01 23:44:09 +00002382
2383 /* Handle %ld, %lu, %lld and %llu. */
2384 longflag = 0;
2385 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002386 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002387 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002388 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002389 longflag = 1;
2390 ++f;
2391 }
2392#ifdef HAVE_LONG_LONG
2393 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002394 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002395 longlongflag = 1;
2396 f += 2;
2397 }
2398#endif
2399 }
2400 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002401 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002402 size_tflag = 1;
2403 ++f;
2404 }
Victor Stinnere215d962012-10-06 23:03:36 +02002405
2406 if (f[1] == '\0')
2407 writer->overallocate = 0;
2408
2409 switch (*f) {
2410 case 'c':
2411 {
2412 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002413 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2414 PyErr_SetString(PyExc_ValueError,
2415 "character argument not in range(0x110000)");
2416 return NULL;
2417 }
Victor Stinnere215d962012-10-06 23:03:36 +02002418 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2419 return NULL;
2420 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2421 writer->pos++;
2422 break;
2423 }
2424
2425 case 'i':
2426 case 'd':
2427 case 'u':
2428 case 'x':
2429 {
2430 /* used by sprintf */
2431 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002432 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002433
2434 if (*f == 'u') {
2435 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2436
2437 if (longflag)
2438 len = sprintf(buffer, fmt,
2439 va_arg(*vargs, unsigned long));
2440#ifdef HAVE_LONG_LONG
2441 else if (longlongflag)
2442 len = sprintf(buffer, fmt,
2443 va_arg(*vargs, unsigned PY_LONG_LONG));
2444#endif
2445 else if (size_tflag)
2446 len = sprintf(buffer, fmt,
2447 va_arg(*vargs, size_t));
2448 else
2449 len = sprintf(buffer, fmt,
2450 va_arg(*vargs, unsigned int));
2451 }
2452 else if (*f == 'x') {
2453 makefmt(fmt, 0, 0, 0, 'x');
2454 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2455 }
2456 else {
2457 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2458
2459 if (longflag)
2460 len = sprintf(buffer, fmt,
2461 va_arg(*vargs, long));
2462#ifdef HAVE_LONG_LONG
2463 else if (longlongflag)
2464 len = sprintf(buffer, fmt,
2465 va_arg(*vargs, PY_LONG_LONG));
2466#endif
2467 else if (size_tflag)
2468 len = sprintf(buffer, fmt,
2469 va_arg(*vargs, Py_ssize_t));
2470 else
2471 len = sprintf(buffer, fmt,
2472 va_arg(*vargs, int));
2473 }
2474 assert(len >= 0);
2475
Victor Stinnere215d962012-10-06 23:03:36 +02002476 if (precision < len)
2477 precision = len;
2478 if (width > precision) {
2479 Py_UCS4 fillchar;
2480 fill = width - precision;
2481 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002482 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2483 return NULL;
2484 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2485 return NULL;
2486 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002487 }
Victor Stinner15a11362012-10-06 23:48:20 +02002488 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002489 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002490 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2491 return NULL;
2492 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2493 return NULL;
2494 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002495 }
Victor Stinner15a11362012-10-06 23:48:20 +02002496 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002497 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002498 break;
2499 }
2500
2501 case 'p':
2502 {
2503 char number[MAX_LONG_LONG_CHARS];
2504
2505 len = sprintf(number, "%p", va_arg(*vargs, void*));
2506 assert(len >= 0);
2507
2508 /* %p is ill-defined: ensure leading 0x. */
2509 if (number[1] == 'X')
2510 number[1] = 'x';
2511 else if (number[1] != 'x') {
2512 memmove(number + 2, number,
2513 strlen(number) + 1);
2514 number[0] = '0';
2515 number[1] = 'x';
2516 len += 2;
2517 }
2518
2519 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2520 return NULL;
2521 break;
2522 }
2523
2524 case 's':
2525 {
2526 /* UTF-8 */
2527 const char *s = va_arg(*vargs, const char*);
2528 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2529 if (!str)
2530 return NULL;
2531 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2532 Py_DECREF(str);
2533 return NULL;
2534 }
2535 Py_DECREF(str);
2536 break;
2537 }
2538
2539 case 'U':
2540 {
2541 PyObject *obj = va_arg(*vargs, PyObject *);
2542 assert(obj && _PyUnicode_CHECK(obj));
2543
2544 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2545 return NULL;
2546 break;
2547 }
2548
2549 case 'V':
2550 {
2551 PyObject *obj = va_arg(*vargs, PyObject *);
2552 const char *str = va_arg(*vargs, const char *);
2553 PyObject *str_obj;
2554 assert(obj || str);
2555 if (obj) {
2556 assert(_PyUnicode_CHECK(obj));
2557 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2558 return NULL;
2559 }
2560 else {
2561 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2562 if (!str_obj)
2563 return NULL;
2564 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2565 Py_DECREF(str_obj);
2566 return NULL;
2567 }
2568 Py_DECREF(str_obj);
2569 }
2570 break;
2571 }
2572
2573 case 'S':
2574 {
2575 PyObject *obj = va_arg(*vargs, PyObject *);
2576 PyObject *str;
2577 assert(obj);
2578 str = PyObject_Str(obj);
2579 if (!str)
2580 return NULL;
2581 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2582 Py_DECREF(str);
2583 return NULL;
2584 }
2585 Py_DECREF(str);
2586 break;
2587 }
2588
2589 case 'R':
2590 {
2591 PyObject *obj = va_arg(*vargs, PyObject *);
2592 PyObject *repr;
2593 assert(obj);
2594 repr = PyObject_Repr(obj);
2595 if (!repr)
2596 return NULL;
2597 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2598 Py_DECREF(repr);
2599 return NULL;
2600 }
2601 Py_DECREF(repr);
2602 break;
2603 }
2604
2605 case 'A':
2606 {
2607 PyObject *obj = va_arg(*vargs, PyObject *);
2608 PyObject *ascii;
2609 assert(obj);
2610 ascii = PyObject_ASCII(obj);
2611 if (!ascii)
2612 return NULL;
2613 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2614 Py_DECREF(ascii);
2615 return NULL;
2616 }
2617 Py_DECREF(ascii);
2618 break;
2619 }
2620
2621 case '%':
2622 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2623 return NULL;
2624 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2625 writer->pos++;
2626 break;
2627
2628 default:
2629 /* if we stumble upon an unknown formatting code, copy the rest
2630 of the format string to the output string. (we cannot just
2631 skip the code, since there's no way to know what's in the
2632 argument list) */
2633 len = strlen(p);
2634 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2635 return NULL;
2636 f = p+len;
2637 return f;
2638 }
2639
2640 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002641 return f;
2642}
2643
Walter Dörwaldd2034312007-05-18 16:29:38 +00002644PyObject *
2645PyUnicode_FromFormatV(const char *format, va_list vargs)
2646{
Victor Stinnere215d962012-10-06 23:03:36 +02002647 va_list vargs2;
2648 const char *f;
2649 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002650
Victor Stinnere215d962012-10-06 23:03:36 +02002651 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2652
2653 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2654 Copy it to be able to pass a reference to a subfunction. */
2655 Py_VA_COPY(vargs2, vargs);
2656
2657 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002658 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002659 f = unicode_fromformat_arg(&writer, f, &vargs2);
2660 if (f == NULL)
2661 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002664 const char *p;
2665 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002666
Victor Stinnere215d962012-10-06 23:03:36 +02002667 p = f;
2668 do
2669 {
2670 if ((unsigned char)*p > 127) {
2671 PyErr_Format(PyExc_ValueError,
2672 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2673 "string, got a non-ASCII byte: 0x%02x",
2674 (unsigned char)*p);
2675 return NULL;
2676 }
2677 p++;
2678 }
2679 while (*p != '\0' && *p != '%');
2680 len = p - f;
2681
2682 if (*p == '\0')
2683 writer.overallocate = 0;
2684 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2685 goto fail;
2686 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2687 writer.pos += len;
2688
2689 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002690 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002691 }
Victor Stinnere215d962012-10-06 23:03:36 +02002692 return _PyUnicodeWriter_Finish(&writer);
2693
2694 fail:
2695 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002697}
2698
Walter Dörwaldd2034312007-05-18 16:29:38 +00002699PyObject *
2700PyUnicode_FromFormat(const char *format, ...)
2701{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 PyObject* ret;
2703 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002704
2705#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002709#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 ret = PyUnicode_FromFormatV(format, vargs);
2711 va_end(vargs);
2712 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713}
2714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715#ifdef HAVE_WCHAR_H
2716
Victor Stinner5593d8a2010-10-02 11:11:27 +00002717/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2718 convert a Unicode object to a wide character string.
2719
Victor Stinnerd88d9832011-09-06 02:00:05 +02002720 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002721 character) required to convert the unicode object. Ignore size argument.
2722
Victor Stinnerd88d9832011-09-06 02:00:05 +02002723 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002724 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002725 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002726static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002727unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002728 wchar_t *w,
2729 Py_ssize_t size)
2730{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732 const wchar_t *wstr;
2733
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002734 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 if (wstr == NULL)
2736 return -1;
2737
Victor Stinner5593d8a2010-10-02 11:11:27 +00002738 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002739 if (size > res)
2740 size = res + 1;
2741 else
2742 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002744 return res;
2745 }
2746 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002748}
2749
2750Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002751PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002752 wchar_t *w,
2753 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754{
2755 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002756 PyErr_BadInternalCall();
2757 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002759 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760}
2761
Victor Stinner137c34c2010-09-29 10:25:54 +00002762wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002763PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002764 Py_ssize_t *size)
2765{
2766 wchar_t* buffer;
2767 Py_ssize_t buflen;
2768
2769 if (unicode == NULL) {
2770 PyErr_BadInternalCall();
2771 return NULL;
2772 }
2773
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002774 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 if (buflen == -1)
2776 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002777 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002778 PyErr_NoMemory();
2779 return NULL;
2780 }
2781
Victor Stinner137c34c2010-09-29 10:25:54 +00002782 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2783 if (buffer == NULL) {
2784 PyErr_NoMemory();
2785 return NULL;
2786 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002787 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002788 if (buflen == -1) {
2789 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002790 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002791 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002792 if (size != NULL)
2793 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002794 return buffer;
2795}
2796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002797#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798
Alexander Belopolsky40018472011-02-26 01:02:56 +00002799PyObject *
2800PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002803 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002804 PyErr_SetString(PyExc_ValueError,
2805 "chr() arg not in range(0x110000)");
2806 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002807 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002809 if (ordinal < 256)
2810 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002812 v = PyUnicode_New(1, ordinal);
2813 if (v == NULL)
2814 return NULL;
2815 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002816 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002817 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002818}
2819
Alexander Belopolsky40018472011-02-26 01:02:56 +00002820PyObject *
2821PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002823 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002824 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002826 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002827 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002828 Py_INCREF(obj);
2829 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002830 }
2831 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 /* For a Unicode subtype that's not a Unicode object,
2833 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002834 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002835 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002836 PyErr_Format(PyExc_TypeError,
2837 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002838 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002839 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002840}
2841
Alexander Belopolsky40018472011-02-26 01:02:56 +00002842PyObject *
2843PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002844 const char *encoding,
2845 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002846{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002847 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002848 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002849
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002851 PyErr_BadInternalCall();
2852 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002854
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002855 /* Decoding bytes objects is the most common case and should be fast */
2856 if (PyBytes_Check(obj)) {
2857 if (PyBytes_GET_SIZE(obj) == 0) {
2858 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002859 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002860 }
2861 else {
2862 v = PyUnicode_Decode(
2863 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2864 encoding, errors);
2865 }
2866 return v;
2867 }
2868
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002869 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002870 PyErr_SetString(PyExc_TypeError,
2871 "decoding str is not supported");
2872 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002873 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002874
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002875 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2876 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2877 PyErr_Format(PyExc_TypeError,
2878 "coercing to str: need bytes, bytearray "
2879 "or buffer-like object, %.80s found",
2880 Py_TYPE(obj)->tp_name);
2881 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002882 }
Tim Petersced69f82003-09-16 20:30:58 +00002883
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002884 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002886 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 }
Tim Petersced69f82003-09-16 20:30:58 +00002888 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002889 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002890
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002891 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002892 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893}
2894
/* Normalize an encoding name so that spelling variants such as "UTF_8"
   compare equal to "utf-8": characters for which Py_ISUPPER() is true are
   passed through Py_TOLOWER(), and '_' is replaced with '-'.  A NULL
   encoding is normalized to "utf-8".

   encoding:  NUL-terminated encoding name, or NULL.
   lower:     output buffer receiving the NUL-terminated normalized name.
   lower_len: capacity of `lower` in bytes, terminator included.

   Return 1 on success, 0 on error (the normalized name does not fit in
   lower_len-1 characters plus the terminator). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* With no room even for the terminator nothing can be stored, and
       lower_len - 1 below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* The default name must respect the buffer capacity as well;
           sizeof() counts the trailing NUL. */
        if (sizeof(default_encoding) > lower_len)
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot; it is reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2931
Alexander Belopolsky40018472011-02-26 01:02:56 +00002932PyObject *
2933PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002934 Py_ssize_t size,
2935 const char *encoding,
2936 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002937{
2938 PyObject *buffer = NULL, *unicode;
2939 Py_buffer info;
2940 char lower[11]; /* Enough for any encoding shortcut */
2941
Fred Drakee4315f52000-05-09 19:53:39 +00002942 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002943 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002944 if ((strcmp(lower, "utf-8") == 0) ||
2945 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002946 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002947 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002948 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002949 (strcmp(lower, "iso-8859-1") == 0))
2950 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002951#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002952 else if (strcmp(lower, "mbcs") == 0)
2953 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002954#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002955 else if (strcmp(lower, "ascii") == 0)
2956 return PyUnicode_DecodeASCII(s, size, errors);
2957 else if (strcmp(lower, "utf-16") == 0)
2958 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2959 else if (strcmp(lower, "utf-32") == 0)
2960 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002962
2963 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002964 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002965 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002966 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002967 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968 if (buffer == NULL)
2969 goto onError;
2970 unicode = PyCodec_Decode(buffer, encoding, errors);
2971 if (unicode == NULL)
2972 goto onError;
2973 if (!PyUnicode_Check(unicode)) {
2974 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002975 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002976 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 Py_DECREF(unicode);
2978 goto onError;
2979 }
2980 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002981 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002982
Benjamin Peterson29060642009-01-31 22:14:21 +00002983 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 Py_XDECREF(buffer);
2985 return NULL;
2986}
2987
Alexander Belopolsky40018472011-02-26 01:02:56 +00002988PyObject *
2989PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002990 const char *encoding,
2991 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002992{
2993 PyObject *v;
2994
2995 if (!PyUnicode_Check(unicode)) {
2996 PyErr_BadArgument();
2997 goto onError;
2998 }
2999
3000 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003001 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003002
3003 /* Decode via the codec registry */
3004 v = PyCodec_Decode(unicode, encoding, errors);
3005 if (v == NULL)
3006 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003007 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003008
Benjamin Peterson29060642009-01-31 22:14:21 +00003009 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003010 return NULL;
3011}
3012
Alexander Belopolsky40018472011-02-26 01:02:56 +00003013PyObject *
3014PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003015 const char *encoding,
3016 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003017{
3018 PyObject *v;
3019
3020 if (!PyUnicode_Check(unicode)) {
3021 PyErr_BadArgument();
3022 goto onError;
3023 }
3024
3025 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003027
3028 /* Decode via the codec registry */
3029 v = PyCodec_Decode(unicode, encoding, errors);
3030 if (v == NULL)
3031 goto onError;
3032 if (!PyUnicode_Check(v)) {
3033 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003034 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003035 Py_TYPE(v)->tp_name);
3036 Py_DECREF(v);
3037 goto onError;
3038 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003039 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003040
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003042 return NULL;
3043}
3044
Alexander Belopolsky40018472011-02-26 01:02:56 +00003045PyObject *
3046PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003047 Py_ssize_t size,
3048 const char *encoding,
3049 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050{
3051 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003052
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 unicode = PyUnicode_FromUnicode(s, size);
3054 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003055 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3057 Py_DECREF(unicode);
3058 return v;
3059}
3060
Alexander Belopolsky40018472011-02-26 01:02:56 +00003061PyObject *
3062PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003063 const char *encoding,
3064 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003065{
3066 PyObject *v;
3067
3068 if (!PyUnicode_Check(unicode)) {
3069 PyErr_BadArgument();
3070 goto onError;
3071 }
3072
3073 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003074 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003075
3076 /* Encode via the codec registry */
3077 v = PyCodec_Encode(unicode, encoding, errors);
3078 if (v == NULL)
3079 goto onError;
3080 return v;
3081
Benjamin Peterson29060642009-01-31 22:14:21 +00003082 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003083 return NULL;
3084}
3085
/* Locate the first wide character of the NUL-terminated string `wstr` that
   wcstombs() refuses to convert, by converting one character at a time
   (one surrogate pair at a time when wchar_t is 16 bits wide).

   Return the index of that character in `wstr`, or 0 if every character
   converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminator */
#else
    wchar_t unit[2];            /* single character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cur = wstr;

    while (*cur != L'\0') {
        const wchar_t *unit_start = cur;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cur[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cur[1]))
        {
            /* Keep the pair together so it is converted as one character. */
            unit[0] = cur[0];
            unit[1] = cur[1];
            unit[2] = 0;
            cur += 2;
        }
        else {
            unit[0] = cur[0];
            unit[1] = 0;
            cur += 1;
        }
#else
        unit[0] = *cur++;
        unit[1] = 0;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3132
Victor Stinner1b579672011-12-17 05:47:23 +01003133static int
3134locale_error_handler(const char *errors, int *surrogateescape)
3135{
3136 if (errors == NULL) {
3137 *surrogateescape = 0;
3138 return 0;
3139 }
3140
3141 if (strcmp(errors, "strict") == 0) {
3142 *surrogateescape = 0;
3143 return 0;
3144 }
3145 if (strcmp(errors, "surrogateescape") == 0) {
3146 *surrogateescape = 1;
3147 return 0;
3148 }
3149 PyErr_Format(PyExc_ValueError,
3150 "only 'strict' and 'surrogateescape' error handlers "
3151 "are supported, not '%s'",
3152 errors);
3153 return -1;
3154}
3155
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003156PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003157PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003158{
3159 Py_ssize_t wlen, wlen2;
3160 wchar_t *wstr;
3161 PyObject *bytes = NULL;
3162 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003163 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003164 PyObject *exc;
3165 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003166 int surrogateescape;
3167
3168 if (locale_error_handler(errors, &surrogateescape) < 0)
3169 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003170
3171 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3172 if (wstr == NULL)
3173 return NULL;
3174
3175 wlen2 = wcslen(wstr);
3176 if (wlen2 != wlen) {
3177 PyMem_Free(wstr);
3178 PyErr_SetString(PyExc_TypeError, "embedded null character");
3179 return NULL;
3180 }
3181
3182 if (surrogateescape) {
3183 /* locale encoding with surrogateescape */
3184 char *str;
3185
3186 str = _Py_wchar2char(wstr, &error_pos);
3187 if (str == NULL) {
3188 if (error_pos == (size_t)-1) {
3189 PyErr_NoMemory();
3190 PyMem_Free(wstr);
3191 return NULL;
3192 }
3193 else {
3194 goto encode_error;
3195 }
3196 }
3197 PyMem_Free(wstr);
3198
3199 bytes = PyBytes_FromString(str);
3200 PyMem_Free(str);
3201 }
3202 else {
3203 size_t len, len2;
3204
3205 len = wcstombs(NULL, wstr, 0);
3206 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003207 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003208 goto encode_error;
3209 }
3210
3211 bytes = PyBytes_FromStringAndSize(NULL, len);
3212 if (bytes == NULL) {
3213 PyMem_Free(wstr);
3214 return NULL;
3215 }
3216
3217 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3218 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003219 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003220 goto encode_error;
3221 }
3222 PyMem_Free(wstr);
3223 }
3224 return bytes;
3225
3226encode_error:
3227 errmsg = strerror(errno);
3228 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003229
3230 if (error_pos == (size_t)-1)
3231 error_pos = wcstombs_errorpos(wstr);
3232
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003233 PyMem_Free(wstr);
3234 Py_XDECREF(bytes);
3235
Victor Stinner2f197072011-12-17 07:08:30 +01003236 if (errmsg != NULL) {
3237 size_t errlen;
3238 wstr = _Py_char2wchar(errmsg, &errlen);
3239 if (wstr != NULL) {
3240 reason = PyUnicode_FromWideChar(wstr, errlen);
3241 PyMem_Free(wstr);
3242 } else
3243 errmsg = NULL;
3244 }
3245 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003246 reason = PyUnicode_FromString(
3247 "wcstombs() encountered an unencodable "
3248 "wide character");
3249 if (reason == NULL)
3250 return NULL;
3251
3252 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3253 "locale", unicode,
3254 (Py_ssize_t)error_pos,
3255 (Py_ssize_t)(error_pos+1),
3256 reason);
3257 Py_DECREF(reason);
3258 if (exc != NULL) {
3259 PyCodec_StrictErrors(exc);
3260 Py_XDECREF(exc);
3261 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003262 return NULL;
3263}
3264
Victor Stinnerad158722010-10-27 00:25:46 +00003265PyObject *
3266PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003267{
Victor Stinner99b95382011-07-04 14:23:54 +02003268#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003269 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003270#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003271 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003272#else
Victor Stinner793b5312011-04-27 00:24:21 +02003273 PyInterpreterState *interp = PyThreadState_GET()->interp;
3274 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3275 cannot use it to encode and decode filenames before it is loaded. Load
3276 the Python codec requires to encode at least its own filename. Use the C
3277 version of the locale codec until the codec registry is initialized and
3278 the Python codec is loaded.
3279
3280 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3281 cannot only rely on it: check also interp->fscodec_initialized for
3282 subinterpreters. */
3283 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003284 return PyUnicode_AsEncodedString(unicode,
3285 Py_FileSystemDefaultEncoding,
3286 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003287 }
3288 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003289 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003290 }
Victor Stinnerad158722010-10-27 00:25:46 +00003291#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003292}
3293
Alexander Belopolsky40018472011-02-26 01:02:56 +00003294PyObject *
3295PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003296 const char *encoding,
3297 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298{
3299 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003300 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003301
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 if (!PyUnicode_Check(unicode)) {
3303 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 }
Fred Drakee4315f52000-05-09 19:53:39 +00003306
Fred Drakee4315f52000-05-09 19:53:39 +00003307 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003308 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003309 if ((strcmp(lower, "utf-8") == 0) ||
3310 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003311 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003312 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003313 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003314 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003315 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003316 }
Victor Stinner37296e82010-06-10 13:36:23 +00003317 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003318 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003319 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003320 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003321#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003322 else if (strcmp(lower, "mbcs") == 0)
3323 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003324#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003325 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003326 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003327 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328
3329 /* Encode via the codec registry */
3330 v = PyCodec_Encode(unicode, encoding, errors);
3331 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003332 return NULL;
3333
3334 /* The normal path */
3335 if (PyBytes_Check(v))
3336 return v;
3337
3338 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003339 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003340 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003341 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003342
3343 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3344 "encoder %s returned bytearray instead of bytes",
3345 encoding);
3346 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003347 Py_DECREF(v);
3348 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003349 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003350
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003351 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3352 Py_DECREF(v);
3353 return b;
3354 }
3355
3356 PyErr_Format(PyExc_TypeError,
3357 "encoder did not return a bytes object (type=%.400s)",
3358 Py_TYPE(v)->tp_name);
3359 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003360 return NULL;
3361}
3362
Alexander Belopolsky40018472011-02-26 01:02:56 +00003363PyObject *
3364PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003365 const char *encoding,
3366 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003367{
3368 PyObject *v;
3369
3370 if (!PyUnicode_Check(unicode)) {
3371 PyErr_BadArgument();
3372 goto onError;
3373 }
3374
3375 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003376 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003377
3378 /* Encode via the codec registry */
3379 v = PyCodec_Encode(unicode, encoding, errors);
3380 if (v == NULL)
3381 goto onError;
3382 if (!PyUnicode_Check(v)) {
3383 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003384 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003385 Py_TYPE(v)->tp_name);
3386 Py_DECREF(v);
3387 goto onError;
3388 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003390
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392 return NULL;
3393}
3394
/* Locate the first byte sequence in the `len` bytes at `str` that
   mbrtowc() reports as invalid or incomplete.

   Return the byte offset of that sequence, or 0 if none is found.  Always
   returns 0 when mbrtowc() is not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cur = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t ch;

    memset(&state, 0, sizeof state);
    while (remaining > 0) {
        size_t consumed = mbrtowc(&ch, cur, remaining, &state);

        if (consumed == 0)
            /* Reached end of string */
            break;
        if (consumed == (size_t)-1 || consumed == (size_t)-2)
            /* Conversion error or incomplete character */
            return cur - str;
        cur += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3425
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003426PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003427PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003428 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003429{
3430 wchar_t smallbuf[256];
3431 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3432 wchar_t *wstr;
3433 size_t wlen, wlen2;
3434 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003435 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003436 size_t error_pos;
3437 char *errmsg;
3438 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003439
3440 if (locale_error_handler(errors, &surrogateescape) < 0)
3441 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003442
3443 if (str[len] != '\0' || len != strlen(str)) {
3444 PyErr_SetString(PyExc_TypeError, "embedded null character");
3445 return NULL;
3446 }
3447
3448 if (surrogateescape)
3449 {
3450 wstr = _Py_char2wchar(str, &wlen);
3451 if (wstr == NULL) {
3452 if (wlen == (size_t)-1)
3453 PyErr_NoMemory();
3454 else
3455 PyErr_SetFromErrno(PyExc_OSError);
3456 return NULL;
3457 }
3458
3459 unicode = PyUnicode_FromWideChar(wstr, wlen);
3460 PyMem_Free(wstr);
3461 }
3462 else {
3463#ifndef HAVE_BROKEN_MBSTOWCS
3464 wlen = mbstowcs(NULL, str, 0);
3465#else
3466 wlen = len;
3467#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003468 if (wlen == (size_t)-1)
3469 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003470 if (wlen+1 <= smallbuf_len) {
3471 wstr = smallbuf;
3472 }
3473 else {
3474 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3475 return PyErr_NoMemory();
3476
3477 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3478 if (!wstr)
3479 return PyErr_NoMemory();
3480 }
3481
3482 /* This shouldn't fail now */
3483 wlen2 = mbstowcs(wstr, str, wlen+1);
3484 if (wlen2 == (size_t)-1) {
3485 if (wstr != smallbuf)
3486 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003487 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003488 }
3489#ifdef HAVE_BROKEN_MBSTOWCS
3490 assert(wlen2 == wlen);
3491#endif
3492 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3493 if (wstr != smallbuf)
3494 PyMem_Free(wstr);
3495 }
3496 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003497
3498decode_error:
3499 errmsg = strerror(errno);
3500 assert(errmsg != NULL);
3501
3502 error_pos = mbstowcs_errorpos(str, len);
3503 if (errmsg != NULL) {
3504 size_t errlen;
3505 wstr = _Py_char2wchar(errmsg, &errlen);
3506 if (wstr != NULL) {
3507 reason = PyUnicode_FromWideChar(wstr, errlen);
3508 PyMem_Free(wstr);
3509 } else
3510 errmsg = NULL;
3511 }
3512 if (errmsg == NULL)
3513 reason = PyUnicode_FromString(
3514 "mbstowcs() encountered an invalid multibyte sequence");
3515 if (reason == NULL)
3516 return NULL;
3517
3518 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3519 "locale", str, len,
3520 (Py_ssize_t)error_pos,
3521 (Py_ssize_t)(error_pos+1),
3522 reason);
3523 Py_DECREF(reason);
3524 if (exc != NULL) {
3525 PyCodec_StrictErrors(exc);
3526 Py_XDECREF(exc);
3527 }
3528 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003529}
3530
3531PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003532PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003533{
3534 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003535 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003536}
3537
3538
3539PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003540PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003541 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003542 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3543}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003544
Christian Heimes5894ba72007-11-04 11:43:14 +00003545PyObject*
3546PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3547{
Victor Stinner99b95382011-07-04 14:23:54 +02003548#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003549 return PyUnicode_DecodeMBCS(s, size, NULL);
3550#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003551 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003552#else
Victor Stinner793b5312011-04-27 00:24:21 +02003553 PyInterpreterState *interp = PyThreadState_GET()->interp;
3554 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3555 cannot use it to encode and decode filenames before it is loaded. Load
3556 the Python codec requires to encode at least its own filename. Use the C
3557 version of the locale codec until the codec registry is initialized and
3558 the Python codec is loaded.
3559
3560 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3561 cannot only rely on it: check also interp->fscodec_initialized for
3562 subinterpreters. */
3563 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003564 return PyUnicode_Decode(s, size,
3565 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003566 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003567 }
3568 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003569 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003570 }
Victor Stinnerad158722010-10-27 00:25:46 +00003571#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003572}
3573
Martin v. Löwis011e8422009-05-05 04:43:17 +00003574
3575int
Antoine Pitrou13348842012-01-29 18:36:34 +01003576_PyUnicode_HasNULChars(PyObject* s)
3577{
3578 static PyObject *nul = NULL;
3579
3580 if (nul == NULL)
3581 nul = PyUnicode_FromStringAndSize("\0", 1);
3582 if (nul == NULL)
3583 return -1;
3584 return PyUnicode_Contains(s, nul);
3585}
3586
3587
3588int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003589PyUnicode_FSConverter(PyObject* arg, void* addr)
3590{
3591 PyObject *output = NULL;
3592 Py_ssize_t size;
3593 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003594 if (arg == NULL) {
3595 Py_DECREF(*(PyObject**)addr);
3596 return 1;
3597 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003598 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003599 output = arg;
3600 Py_INCREF(output);
3601 }
3602 else {
3603 arg = PyUnicode_FromObject(arg);
3604 if (!arg)
3605 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003606 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003607 Py_DECREF(arg);
3608 if (!output)
3609 return 0;
3610 if (!PyBytes_Check(output)) {
3611 Py_DECREF(output);
3612 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3613 return 0;
3614 }
3615 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003616 size = PyBytes_GET_SIZE(output);
3617 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003618 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003619 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003620 Py_DECREF(output);
3621 return 0;
3622 }
3623 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003624 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003625}
3626
3627
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003628int
3629PyUnicode_FSDecoder(PyObject* arg, void* addr)
3630{
3631 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003632 if (arg == NULL) {
3633 Py_DECREF(*(PyObject**)addr);
3634 return 1;
3635 }
3636 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003637 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003638 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003639 output = arg;
3640 Py_INCREF(output);
3641 }
3642 else {
3643 arg = PyBytes_FromObject(arg);
3644 if (!arg)
3645 return 0;
3646 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3647 PyBytes_GET_SIZE(arg));
3648 Py_DECREF(arg);
3649 if (!output)
3650 return 0;
3651 if (!PyUnicode_Check(output)) {
3652 Py_DECREF(output);
3653 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3654 return 0;
3655 }
3656 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003657 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003658 Py_DECREF(output);
3659 return 0;
3660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003661 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003662 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003663 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3664 Py_DECREF(output);
3665 return 0;
3666 }
3667 *(PyObject**)addr = output;
3668 return Py_CLEANUP_SUPPORTED;
3669}
3670
3671
Martin v. Löwis5b222132007-06-10 09:51:05 +00003672char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003673PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003674{
Christian Heimesf3863112007-11-22 07:46:41 +00003675 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003676
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003677 if (!PyUnicode_Check(unicode)) {
3678 PyErr_BadArgument();
3679 return NULL;
3680 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003681 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003682 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003683
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003684 if (PyUnicode_UTF8(unicode) == NULL) {
3685 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003686 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3687 if (bytes == NULL)
3688 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003689 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3690 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003691 Py_DECREF(bytes);
3692 return NULL;
3693 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003694 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3695 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3696 PyBytes_AS_STRING(bytes),
3697 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003698 Py_DECREF(bytes);
3699 }
3700
3701 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003702 *psize = PyUnicode_UTF8_LENGTH(unicode);
3703 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003704}
3705
3706char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003707PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3710}
3711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003712Py_UNICODE *
3713PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3714{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003715 const unsigned char *one_byte;
3716#if SIZEOF_WCHAR_T == 4
3717 const Py_UCS2 *two_bytes;
3718#else
3719 const Py_UCS4 *four_bytes;
3720 const Py_UCS4 *ucs4_end;
3721 Py_ssize_t num_surrogates;
3722#endif
3723 wchar_t *w;
3724 wchar_t *wchar_end;
3725
3726 if (!PyUnicode_Check(unicode)) {
3727 PyErr_BadArgument();
3728 return NULL;
3729 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003730 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003731 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003732 assert(_PyUnicode_KIND(unicode) != 0);
3733 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003735 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003736#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003737 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3738 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003739 num_surrogates = 0;
3740
3741 for (; four_bytes < ucs4_end; ++four_bytes) {
3742 if (*four_bytes > 0xFFFF)
3743 ++num_surrogates;
3744 }
3745
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003746 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3747 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3748 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003749 PyErr_NoMemory();
3750 return NULL;
3751 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003752 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003754 w = _PyUnicode_WSTR(unicode);
3755 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3756 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003757 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3758 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003759 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003760 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003761 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3762 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003763 }
3764 else
3765 *w = *four_bytes;
3766
3767 if (w > wchar_end) {
3768 assert(0 && "Miscalculated string end");
3769 }
3770 }
3771 *w = 0;
3772#else
3773 /* sizeof(wchar_t) == 4 */
3774 Py_FatalError("Impossible unicode object state, wstr and str "
3775 "should share memory already.");
3776 return NULL;
3777#endif
3778 }
3779 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003780 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3781 (_PyUnicode_LENGTH(unicode) + 1));
3782 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003783 PyErr_NoMemory();
3784 return NULL;
3785 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003786 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3787 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3788 w = _PyUnicode_WSTR(unicode);
3789 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003790
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003791 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3792 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793 for (; w < wchar_end; ++one_byte, ++w)
3794 *w = *one_byte;
3795 /* null-terminate the wstr */
3796 *w = 0;
3797 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003798 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003799#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003800 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003801 for (; w < wchar_end; ++two_bytes, ++w)
3802 *w = *two_bytes;
3803 /* null-terminate the wstr */
3804 *w = 0;
3805#else
3806 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003807 PyObject_FREE(_PyUnicode_WSTR(unicode));
3808 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003809 Py_FatalError("Impossible unicode object state, wstr "
3810 "and str should share memory already.");
3811 return NULL;
3812#endif
3813 }
3814 else {
3815 assert(0 && "This should never happen.");
3816 }
3817 }
3818 }
3819 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003820 *size = PyUnicode_WSTR_LENGTH(unicode);
3821 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003822}
3823
Alexander Belopolsky40018472011-02-26 01:02:56 +00003824Py_UNICODE *
3825PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828}
3829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003830
Alexander Belopolsky40018472011-02-26 01:02:56 +00003831Py_ssize_t
3832PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833{
3834 if (!PyUnicode_Check(unicode)) {
3835 PyErr_BadArgument();
3836 goto onError;
3837 }
3838 return PyUnicode_GET_SIZE(unicode);
3839
Benjamin Peterson29060642009-01-31 22:14:21 +00003840 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 return -1;
3842}
3843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844Py_ssize_t
3845PyUnicode_GetLength(PyObject *unicode)
3846{
Victor Stinner07621332012-06-16 04:53:46 +02003847 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003848 PyErr_BadArgument();
3849 return -1;
3850 }
Victor Stinner07621332012-06-16 04:53:46 +02003851 if (PyUnicode_READY(unicode) == -1)
3852 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853 return PyUnicode_GET_LENGTH(unicode);
3854}
3855
3856Py_UCS4
3857PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3858{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003859 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3860 PyErr_BadArgument();
3861 return (Py_UCS4)-1;
3862 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003863 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003864 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003865 return (Py_UCS4)-1;
3866 }
3867 return PyUnicode_READ_CHAR(unicode, index);
3868}
3869
3870int
3871PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3872{
3873 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003874 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 return -1;
3876 }
Victor Stinner488fa492011-12-12 00:01:39 +01003877 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003878 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003879 PyErr_SetString(PyExc_IndexError, "string index out of range");
3880 return -1;
3881 }
Victor Stinner488fa492011-12-12 00:01:39 +01003882 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003883 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003884 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3885 PyErr_SetString(PyExc_ValueError, "character out of range");
3886 return -1;
3887 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003888 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3889 index, ch);
3890 return 0;
3891}
3892
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3898
Victor Stinner554f3f02010-06-16 23:33:54 +00003899/* create or adjust a UnicodeDecodeError */
3900static void
3901make_decode_exception(PyObject **exceptionObject,
3902 const char *encoding,
3903 const char *input, Py_ssize_t length,
3904 Py_ssize_t startpos, Py_ssize_t endpos,
3905 const char *reason)
3906{
3907 if (*exceptionObject == NULL) {
3908 *exceptionObject = PyUnicodeDecodeError_Create(
3909 encoding, input, length, startpos, endpos, reason);
3910 }
3911 else {
3912 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3913 goto onError;
3914 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3915 goto onError;
3916 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3917 goto onError;
3918 }
3919 return;
3920
3921onError:
3922 Py_DECREF(*exceptionObject);
3923 *exceptionObject = NULL;
3924}
3925
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003926/* error handling callback helper:
3927 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003928 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 and adjust various state variables.
3930 return 0 on success, -1 on error
3931*/
3932
Alexander Belopolsky40018472011-02-26 01:02:56 +00003933static int
3934unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003935 const char *encoding, const char *reason,
3936 const char **input, const char **inend, Py_ssize_t *startinpos,
3937 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003938 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003939{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003940 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003941
3942 PyObject *restuple = NULL;
3943 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003944 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003945 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003946 Py_ssize_t requiredsize;
3947 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003948 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003949 int res = -1;
3950
Victor Stinner596a6c42011-11-09 00:02:18 +01003951 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3952 outsize = PyUnicode_GET_LENGTH(*output);
3953 else
3954 outsize = _PyUnicode_WSTR_LENGTH(*output);
3955
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003957 *errorHandler = PyCodec_LookupError(errors);
3958 if (*errorHandler == NULL)
3959 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 }
3961
Victor Stinner554f3f02010-06-16 23:33:54 +00003962 make_decode_exception(exceptionObject,
3963 encoding,
3964 *input, *inend - *input,
3965 *startinpos, *endinpos,
3966 reason);
3967 if (*exceptionObject == NULL)
3968 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969
3970 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3971 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003972 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003974 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003976 }
3977 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003978 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05003979 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003980 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003981
3982 /* Copy back the bytes variables, which might have been modified by the
3983 callback */
3984 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3985 if (!inputobj)
3986 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003987 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003988 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003989 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003990 *input = PyBytes_AS_STRING(inputobj);
3991 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003992 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003993 /* we can DECREF safely, as the exception has another reference,
3994 so the object won't go away. */
3995 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003996
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003997 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003998 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003999 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004000 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4001 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004002 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003
Victor Stinner596a6c42011-11-09 00:02:18 +01004004 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4005 /* need more space? (at least enough for what we
4006 have+the replacement+the rest of the string (starting
4007 at the new input position), so we won't have to check space
4008 when there are no errors in the rest of the string) */
4009 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4010 requiredsize = *outpos + replen + insize-newpos;
4011 if (requiredsize > outsize) {
4012 if (requiredsize<2*outsize)
4013 requiredsize = 2*outsize;
4014 if (unicode_resize(output, requiredsize) < 0)
4015 goto onError;
4016 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004017 if (unicode_widen(output, *outpos,
4018 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004019 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004020 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004021 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004022 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004023 else {
4024 wchar_t *repwstr;
4025 Py_ssize_t repwlen;
4026 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4027 if (repwstr == NULL)
4028 goto onError;
4029 /* need more space? (at least enough for what we
4030 have+the replacement+the rest of the string (starting
4031 at the new input position), so we won't have to check space
4032 when there are no errors in the rest of the string) */
4033 requiredsize = *outpos + repwlen + insize-newpos;
4034 if (requiredsize > outsize) {
4035 if (requiredsize < 2*outsize)
4036 requiredsize = 2*outsize;
4037 if (unicode_resize(output, requiredsize) < 0)
4038 goto onError;
4039 }
4040 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4041 *outpos += repwlen;
4042 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004044 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004045
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004046 /* we made it! */
4047 res = 0;
4048
Benjamin Peterson29060642009-01-31 22:14:21 +00004049 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 Py_XDECREF(restuple);
4051 return res;
4052}
4053
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004054/* --- UTF-7 Codec -------------------------------------------------------- */
4055
Antoine Pitrou244651a2009-05-04 18:56:13 +00004056/* See RFC2152 for details. We encode conservatively and decode liberally. */
4057
4058/* Three simple macros defining base-64. */
4059
/* IS_BASE64: true when c belongs to the base-64 alphabet
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of the base-64 character c.
   Only meaningful when IS_BASE64(c) holds. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte found in a UTF-7 string outside a shift
 * sequence stands for itself.  Decoding is permissive: every ASCII byte
 * qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4088
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c:        character to classify; NUL and anything >= 128 is never direct.
 * directO:  non-zero if "Set O" characters may be written as themselves.
 * directWS: non-zero if whitespace may be written as itself.
 *
 * Both flags are parenthesized in the expansion so that a compound
 * argument (e.g. `a || b`) is taken as a whole. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004134
Alexander Belopolsky40018472011-02-26 01:02:56 +00004135PyObject *
4136PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004137 Py_ssize_t size,
4138 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004139{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004140 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4141}
4142
Antoine Pitrou244651a2009-05-04 18:56:13 +00004143/* The decoder. The only state we preserve is our read position,
4144 * i.e. how many characters we have consumed. So if we end in the
4145 * middle of a shift sequence we have to back off the read position
4146 * and the output to the beginning of the sequence, otherwise we lose
4147 * all the shift state (seen bits, number of bits seen, high
4148 * surrogate). */
4149
Alexander Belopolsky40018472011-02-26 01:02:56 +00004150PyObject *
4151PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004152 Py_ssize_t size,
4153 const char *errors,
4154 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004155{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004157 Py_ssize_t startinpos;
4158 Py_ssize_t endinpos;
4159 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004160 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004161 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004162 const char *errmsg = "";
4163 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004164 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004165 unsigned int base64bits = 0;
4166 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004167 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004168 PyObject *errorHandler = NULL;
4169 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004170
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004171 /* Start off assuming it's all ASCII. Widen later as necessary. */
4172 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004173 if (!unicode)
4174 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004175 if (size == 0) {
4176 if (consumed)
4177 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004178 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004179 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004180
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004181 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004182 e = s + size;
4183
4184 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004185 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004186 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004187 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004188
Antoine Pitrou244651a2009-05-04 18:56:13 +00004189 if (inShift) { /* in a base-64 section */
4190 if (IS_BASE64(ch)) { /* consume a base-64 character */
4191 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4192 base64bits += 6;
4193 s++;
4194 if (base64bits >= 16) {
4195 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004196 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004197 base64bits -= 16;
4198 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4199 if (surrogate) {
4200 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004201 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4202 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004203 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4204 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004205 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004206 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004207 }
4208 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004209 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4210 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004211 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004212 }
4213 }
Victor Stinner551ac952011-11-29 22:58:13 +01004214 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004215 /* first surrogate */
4216 surrogate = outCh;
4217 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004218 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004219 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4220 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004221 }
4222 }
4223 }
4224 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004225 inShift = 0;
4226 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004227 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004228 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4229 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004230 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004231 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004232 if (base64bits > 0) { /* left-over bits */
4233 if (base64bits >= 6) {
4234 /* We've seen at least one base-64 character */
4235 errmsg = "partial character in shift sequence";
4236 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004237 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004238 else {
4239 /* Some bits remain; they should be zero */
4240 if (base64buffer != 0) {
4241 errmsg = "non-zero padding bits in shift sequence";
4242 goto utf7Error;
4243 }
4244 }
4245 }
4246 if (ch != '-') {
4247 /* '-' is absorbed; other terminating
4248 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004249 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4250 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004251 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004252 }
4253 }
4254 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004255 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004256 s++; /* consume '+' */
4257 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004258 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004259 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4260 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004261 }
4262 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004263 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004264 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004265 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004266 }
4267 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004268 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004269 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4270 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004271 s++;
4272 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004273 else {
4274 startinpos = s-starts;
4275 s++;
4276 errmsg = "unexpected special character";
4277 goto utf7Error;
4278 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004279 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004280utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281 endinpos = s-starts;
4282 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 errors, &errorHandler,
4284 "utf7", errmsg,
4285 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004286 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004287 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004288 }
4289
Antoine Pitrou244651a2009-05-04 18:56:13 +00004290 /* end of string */
4291
4292 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4293 /* if we're in an inconsistent state, that's an error */
4294 if (surrogate ||
4295 (base64bits >= 6) ||
4296 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004297 endinpos = size;
4298 if (unicode_decode_call_errorhandler(
4299 errors, &errorHandler,
4300 "utf7", "unterminated shift sequence",
4301 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004302 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004303 goto onError;
4304 if (s < e)
4305 goto restart;
4306 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004307 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004308
4309 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004310 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004311 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004312 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004313 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004314 }
4315 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004316 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004317 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004318 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004319
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004320 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004321 goto onError;
4322
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004323 Py_XDECREF(errorHandler);
4324 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004325 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004326
Benjamin Peterson29060642009-01-31 22:14:21 +00004327 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 Py_XDECREF(errorHandler);
4329 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004330 Py_DECREF(unicode);
4331 return NULL;
4332}
4333
4334
Alexander Belopolsky40018472011-02-26 01:02:56 +00004335PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004336_PyUnicode_EncodeUTF7(PyObject *str,
4337 int base64SetO,
4338 int base64WhiteSpace,
4339 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004340{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004341 int kind;
4342 void *data;
4343 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004344 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004346 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004347 unsigned int base64bits = 0;
4348 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004349 char * out;
4350 char * start;
4351
Benjamin Petersonbac79492012-01-14 13:34:47 -05004352 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004353 return NULL;
4354 kind = PyUnicode_KIND(str);
4355 data = PyUnicode_DATA(str);
4356 len = PyUnicode_GET_LENGTH(str);
4357
4358 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004359 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004360
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004361 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004362 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004363 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004364 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004365 if (v == NULL)
4366 return NULL;
4367
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004368 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004369 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004370 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004371
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 if (inShift) {
4373 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4374 /* shifting out */
4375 if (base64bits) { /* output remaining bits */
4376 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4377 base64buffer = 0;
4378 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379 }
4380 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 /* Characters not in the BASE64 set implicitly unshift the sequence
4382 so no '-' is required, except if the character is itself a '-' */
4383 if (IS_BASE64(ch) || ch == '-') {
4384 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004385 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 *out++ = (char) ch;
4387 }
4388 else {
4389 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004390 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004391 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004392 else { /* not in a shift sequence */
4393 if (ch == '+') {
4394 *out++ = '+';
4395 *out++ = '-';
4396 }
4397 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4398 *out++ = (char) ch;
4399 }
4400 else {
4401 *out++ = '+';
4402 inShift = 1;
4403 goto encode_char;
4404 }
4405 }
4406 continue;
4407encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004408 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004409 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004410
Antoine Pitrou244651a2009-05-04 18:56:13 +00004411 /* code first surrogate */
4412 base64bits += 16;
4413 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4414 while (base64bits >= 6) {
4415 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4416 base64bits -= 6;
4417 }
4418 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004419 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 base64bits += 16;
4422 base64buffer = (base64buffer << 16) | ch;
4423 while (base64bits >= 6) {
4424 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4425 base64bits -= 6;
4426 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004427 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428 if (base64bits)
4429 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4430 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004431 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004432 if (_PyBytes_Resize(&v, out - start) < 0)
4433 return NULL;
4434 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004435}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004436PyObject *
4437PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4438 Py_ssize_t size,
4439 int base64SetO,
4440 int base64WhiteSpace,
4441 const char *errors)
4442{
4443 PyObject *result;
4444 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4445 if (tmp == NULL)
4446 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004447 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004448 base64WhiteSpace, errors);
4449 Py_DECREF(tmp);
4450 return result;
4451}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004452
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453#undef IS_BASE64
4454#undef FROM_BASE64
4455#undef TO_BASE64
4456#undef DECODE_DIRECT
4457#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004458
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459/* --- UTF-8 Codec -------------------------------------------------------- */
4460
Alexander Belopolsky40018472011-02-26 01:02:56 +00004461PyObject *
4462PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004463 Py_ssize_t size,
4464 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465{
Walter Dörwald69652032004-09-07 20:24:22 +00004466 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4467}
4468
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004469#include "stringlib/asciilib.h"
4470#include "stringlib/codecs.h"
4471#include "stringlib/undef.h"
4472
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004473#include "stringlib/ucs1lib.h"
4474#include "stringlib/codecs.h"
4475#include "stringlib/undef.h"
4476
4477#include "stringlib/ucs2lib.h"
4478#include "stringlib/codecs.h"
4479#include "stringlib/undef.h"
4480
4481#include "stringlib/ucs4lib.h"
4482#include "stringlib/codecs.h"
4483#include "stringlib/undef.h"
4484
Antoine Pitrouab868312009-01-10 15:40:25 +00004485/* Mask to quickly check whether a C 'long' contains a
4486 non-ASCII, UTF8-encoded char. */
4487#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004488# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004489#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004490# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004491#else
4492# error C 'long' size should be either 4 or 8!
4493#endif
4494
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004495static Py_ssize_t
4496ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004497{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004498 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004499 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004500
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004501#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004502 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4503 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004504 /* Fast path, see in STRINGLIB(utf8_decode) for
4505 an explanation. */
4506 /* Help register allocation */
4507 register const char *_p = p;
4508 register Py_UCS1 * q = dest;
4509 while (_p < aligned_end) {
4510 unsigned long value = *(const unsigned long *) _p;
4511 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004513 *((unsigned long *)q) = value;
4514 _p += SIZEOF_LONG;
4515 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004516 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004517 p = _p;
4518 while (p < end) {
4519 if ((unsigned char)*p & 0x80)
4520 break;
4521 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004523 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004525#endif
4526 while (p < end) {
4527 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4528 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004529 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004530 /* Help register allocation */
4531 register const char *_p = p;
4532 while (_p < aligned_end) {
4533 unsigned long value = *(unsigned long *) _p;
4534 if (value & ASCII_CHAR_MASK)
4535 break;
4536 _p += SIZEOF_LONG;
4537 }
4538 p = _p;
4539 if (_p == end)
4540 break;
4541 }
4542 if ((unsigned char)*p & 0x80)
4543 break;
4544 ++p;
4545 }
4546 memcpy(dest, start, p - start);
4547 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548}
Antoine Pitrouab868312009-01-10 15:40:25 +00004549
Victor Stinner785938e2011-12-11 20:09:03 +01004550PyObject *
4551PyUnicode_DecodeUTF8Stateful(const char *s,
4552 Py_ssize_t size,
4553 const char *errors,
4554 Py_ssize_t *consumed)
4555{
Victor Stinner785938e2011-12-11 20:09:03 +01004556 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004557 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004558 const char *end = s + size;
4559 Py_ssize_t outpos;
4560
4561 Py_ssize_t startinpos;
4562 Py_ssize_t endinpos;
4563 const char *errmsg = "";
4564 PyObject *errorHandler = NULL;
4565 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004566
4567 if (size == 0) {
4568 if (consumed)
4569 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004570 Py_INCREF(unicode_empty);
4571 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004572 }
4573
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004574 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4575 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004576 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004577 *consumed = 1;
4578 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004579 }
4580
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004581 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004582 if (!unicode)
4583 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004584
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004585 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4586 s += outpos;
4587 while (s < end) {
4588 Py_UCS4 ch;
4589 int kind = PyUnicode_KIND(unicode);
4590 if (kind == PyUnicode_1BYTE_KIND) {
4591 if (PyUnicode_IS_ASCII(unicode))
4592 ch = asciilib_utf8_decode(&s, end,
4593 PyUnicode_1BYTE_DATA(unicode), &outpos);
4594 else
4595 ch = ucs1lib_utf8_decode(&s, end,
4596 PyUnicode_1BYTE_DATA(unicode), &outpos);
4597 } else if (kind == PyUnicode_2BYTE_KIND) {
4598 ch = ucs2lib_utf8_decode(&s, end,
4599 PyUnicode_2BYTE_DATA(unicode), &outpos);
4600 } else {
4601 assert(kind == PyUnicode_4BYTE_KIND);
4602 ch = ucs4lib_utf8_decode(&s, end,
4603 PyUnicode_4BYTE_DATA(unicode), &outpos);
4604 }
4605
4606 switch (ch) {
4607 case 0:
4608 if (s == end || consumed)
4609 goto End;
4610 errmsg = "unexpected end of data";
4611 startinpos = s - starts;
4612 endinpos = startinpos + 1;
4613 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4614 endinpos++;
4615 break;
4616 case 1:
4617 errmsg = "invalid start byte";
4618 startinpos = s - starts;
4619 endinpos = startinpos + 1;
4620 break;
4621 case 2:
4622 errmsg = "invalid continuation byte";
4623 startinpos = s - starts;
4624 endinpos = startinpos + 1;
4625 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4626 endinpos++;
4627 break;
4628 default:
4629 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4630 goto onError;
4631 continue;
4632 }
4633
4634 if (unicode_decode_call_errorhandler(
4635 errors, &errorHandler,
4636 "utf-8", errmsg,
4637 &starts, &end, &startinpos, &endinpos, &exc, &s,
4638 &unicode, &outpos))
4639 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004640 }
4641
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004642End:
4643 if (unicode_resize(&unicode, outpos) < 0)
4644 goto onError;
4645
4646 if (consumed)
4647 *consumed = s - starts;
4648
4649 Py_XDECREF(errorHandler);
4650 Py_XDECREF(exc);
4651 assert(_PyUnicode_CheckConsistency(unicode, 1));
4652 return unicode;
4653
4654onError:
4655 Py_XDECREF(errorHandler);
4656 Py_XDECREF(exc);
4657 Py_XDECREF(unicode);
4658 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004659}
4660
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a NUL-terminated wchar_t buffer
   allocated with PyMem_Malloc().  Every byte that cannot be decoded is
   stored as 0xDC00 + byte.  Returns NULL if the buffer size would overflow
   or the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
            /* The decoder handed back a character instead of storing it. */
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* The value is split into a surrogate pair below, so it must be
               a code point outside the BMP -- not a surrogate itself. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* status code from the decoder: finished, or undecodable byte */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004716/* Primary internal function which creates utf8 encoded bytes objects.
4717
4718 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004719 and allocate exactly as much space needed at the end. Else allocate the
4720 maximum possible needed (4 result bytes per Unicode character), and return
4721 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004722*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004723PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004724_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725{
Victor Stinner6099a032011-12-18 14:22:26 +01004726 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004727 void *data;
4728 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004730 if (!PyUnicode_Check(unicode)) {
4731 PyErr_BadArgument();
4732 return NULL;
4733 }
4734
4735 if (PyUnicode_READY(unicode) == -1)
4736 return NULL;
4737
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004738 if (PyUnicode_UTF8(unicode))
4739 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4740 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004741
4742 kind = PyUnicode_KIND(unicode);
4743 data = PyUnicode_DATA(unicode);
4744 size = PyUnicode_GET_LENGTH(unicode);
4745
Benjamin Petersonead6b532011-12-20 17:23:42 -06004746 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004747 default:
4748 assert(0);
4749 case PyUnicode_1BYTE_KIND:
4750 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4751 assert(!PyUnicode_IS_ASCII(unicode));
4752 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4753 case PyUnicode_2BYTE_KIND:
4754 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4755 case PyUnicode_4BYTE_KIND:
4756 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004757 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758}
4759
Alexander Belopolsky40018472011-02-26 01:02:56 +00004760PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004761PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4762 Py_ssize_t size,
4763 const char *errors)
4764{
4765 PyObject *v, *unicode;
4766
4767 unicode = PyUnicode_FromUnicode(s, size);
4768 if (unicode == NULL)
4769 return NULL;
4770 v = _PyUnicode_AsUTF8String(unicode, errors);
4771 Py_DECREF(unicode);
4772 return v;
4773}
4774
4775PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004776PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004778 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779}
4780
Walter Dörwald41980ca2007-08-16 21:55:45 +00004781/* --- UTF-32 Codec ------------------------------------------------------- */
4782
4783PyObject *
4784PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004785 Py_ssize_t size,
4786 const char *errors,
4787 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004788{
4789 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4790}
4791
4792PyObject *
4793PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 Py_ssize_t size,
4795 const char *errors,
4796 int *byteorder,
4797 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004798{
4799 const char *starts = s;
4800 Py_ssize_t startinpos;
4801 Py_ssize_t endinpos;
4802 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004803 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004804 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004805 int bo = 0; /* assume native ordering by default */
4806 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004807 /* Offsets from q for retrieving bytes in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02004808#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00004809 int iorder[] = {0, 1, 2, 3};
4810#else
4811 int iorder[] = {3, 2, 1, 0};
4812#endif
4813 PyObject *errorHandler = NULL;
4814 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004815
Walter Dörwald41980ca2007-08-16 21:55:45 +00004816 q = (unsigned char *)s;
4817 e = q + size;
4818
4819 if (byteorder)
4820 bo = *byteorder;
4821
4822 /* Check for BOM marks (U+FEFF) in the input and adjust current
4823 byte order setting accordingly. In native mode, the leading BOM
4824 mark is skipped, in all other modes, it is copied to the output
4825 stream as-is (giving a ZWNBSP character). */
4826 if (bo == 0) {
4827 if (size >= 4) {
4828 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004829 (q[iorder[1]] << 8) | q[iorder[0]];
Christian Heimes743e0cd2012-10-17 23:52:17 +02004830#if PY_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 if (bom == 0x0000FEFF) {
4832 q += 4;
4833 bo = -1;
4834 }
4835 else if (bom == 0xFFFE0000) {
4836 q += 4;
4837 bo = 1;
4838 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004839#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 if (bom == 0x0000FEFF) {
4841 q += 4;
4842 bo = 1;
4843 }
4844 else if (bom == 0xFFFE0000) {
4845 q += 4;
4846 bo = -1;
4847 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004848#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004849 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004850 }
4851
4852 if (bo == -1) {
4853 /* force LE */
4854 iorder[0] = 0;
4855 iorder[1] = 1;
4856 iorder[2] = 2;
4857 iorder[3] = 3;
4858 }
4859 else if (bo == 1) {
4860 /* force BE */
4861 iorder[0] = 3;
4862 iorder[1] = 2;
4863 iorder[2] = 1;
4864 iorder[3] = 0;
4865 }
4866
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004867 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004868 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004869 if (!unicode)
4870 return NULL;
4871 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01004872 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004873 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004874
Walter Dörwald41980ca2007-08-16 21:55:45 +00004875 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004876 Py_UCS4 ch;
4877 /* remaining bytes at the end? (size should be divisible by 4) */
4878 if (e-q<4) {
4879 if (consumed)
4880 break;
4881 errmsg = "truncated data";
4882 startinpos = ((const char *)q)-starts;
4883 endinpos = ((const char *)e)-starts;
4884 goto utf32Error;
4885 /* The remaining input chars are ignored if the callback
4886 chooses to skip the input */
4887 }
4888 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4889 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004890
Benjamin Peterson29060642009-01-31 22:14:21 +00004891 if (ch >= 0x110000)
4892 {
4893 errmsg = "codepoint not in range(0x110000)";
4894 startinpos = ((const char *)q)-starts;
4895 endinpos = startinpos+4;
4896 goto utf32Error;
4897 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004898 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4899 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 q += 4;
4901 continue;
4902 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 if (unicode_decode_call_errorhandler(
4904 errors, &errorHandler,
4905 "utf32", errmsg,
4906 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004907 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004908 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004909 }
4910
4911 if (byteorder)
4912 *byteorder = bo;
4913
4914 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004915 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004916
4917 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01004918 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004919 goto onError;
4920
4921 Py_XDECREF(errorHandler);
4922 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004923 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004924
Benjamin Peterson29060642009-01-31 22:14:21 +00004925 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004926 Py_DECREF(unicode);
4927 Py_XDECREF(errorHandler);
4928 Py_XDECREF(exc);
4929 return NULL;
4930}
4931
4932PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004933_PyUnicode_EncodeUTF32(PyObject *str,
4934 const char *errors,
4935 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004936{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004937 int kind;
4938 void *data;
4939 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004940 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004941 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004942 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004943 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02004944#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945 int iorder[] = {0, 1, 2, 3};
4946#else
4947 int iorder[] = {3, 2, 1, 0};
4948#endif
4949
Benjamin Peterson29060642009-01-31 22:14:21 +00004950#define STORECHAR(CH) \
4951 do { \
4952 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4953 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4954 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4955 p[iorder[0]] = (CH) & 0xff; \
4956 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957 } while(0)
4958
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004959 if (!PyUnicode_Check(str)) {
4960 PyErr_BadArgument();
4961 return NULL;
4962 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004963 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004964 return NULL;
4965 kind = PyUnicode_KIND(str);
4966 data = PyUnicode_DATA(str);
4967 len = PyUnicode_GET_LENGTH(str);
4968
4969 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004970 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004972 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004973 if (v == NULL)
4974 return NULL;
4975
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004976 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004978 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004979 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004980 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004981
4982 if (byteorder == -1) {
4983 /* force LE */
4984 iorder[0] = 0;
4985 iorder[1] = 1;
4986 iorder[2] = 2;
4987 iorder[3] = 3;
4988 }
4989 else if (byteorder == 1) {
4990 /* force BE */
4991 iorder[0] = 3;
4992 iorder[1] = 2;
4993 iorder[2] = 1;
4994 iorder[3] = 0;
4995 }
4996
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004997 for (i = 0; i < len; i++)
4998 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00004999
5000 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005001 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005002#undef STORECHAR
5003}
5004
Alexander Belopolsky40018472011-02-26 01:02:56 +00005005PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005006PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5007 Py_ssize_t size,
5008 const char *errors,
5009 int byteorder)
5010{
5011 PyObject *result;
5012 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5013 if (tmp == NULL)
5014 return NULL;
5015 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5016 Py_DECREF(tmp);
5017 return result;
5018}
5019
5020PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005021PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005022{
Victor Stinnerb960b342011-11-20 19:12:52 +01005023 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005024}
5025
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026/* --- UTF-16 Codec ------------------------------------------------------- */
5027
Tim Peters772747b2001-08-09 22:21:55 +00005028PyObject *
5029PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 Py_ssize_t size,
5031 const char *errors,
5032 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033{
Walter Dörwald69652032004-09-07 20:24:22 +00005034 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5035}
5036
5037PyObject *
5038PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 Py_ssize_t size,
5040 const char *errors,
5041 int *byteorder,
5042 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005043{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005044 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005045 Py_ssize_t startinpos;
5046 Py_ssize_t endinpos;
5047 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005048 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005049 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005050 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005051 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005052 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005053 PyObject *errorHandler = NULL;
5054 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055
Tim Peters772747b2001-08-09 22:21:55 +00005056 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005057 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058
5059 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005060 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005062 /* Check for BOM marks (U+FEFF) in the input and adjust current
5063 byte order setting accordingly. In native mode, the leading BOM
5064 mark is skipped, in all other modes, it is copied to the output
5065 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005066 if (bo == 0 && size >= 2) {
5067 const Py_UCS4 bom = (q[1] << 8) | q[0];
5068 if (bom == 0xFEFF) {
5069 q += 2;
5070 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005071 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005072 else if (bom == 0xFFFE) {
5073 q += 2;
5074 bo = 1;
5075 }
5076 if (byteorder)
5077 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079
Antoine Pitrou63065d72012-05-15 23:48:04 +02005080 if (q == e) {
5081 if (consumed)
5082 *consumed = size;
5083 Py_INCREF(unicode_empty);
5084 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005085 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005086
Christian Heimes743e0cd2012-10-17 23:52:17 +02005087#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005088 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005089#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005090 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005091#endif
Tim Peters772747b2001-08-09 22:21:55 +00005092
Antoine Pitrou63065d72012-05-15 23:48:04 +02005093 /* Note: size will always be longer than the resulting Unicode
5094 character count */
5095 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5096 if (!unicode)
5097 return NULL;
5098
5099 outpos = 0;
5100 while (1) {
5101 Py_UCS4 ch = 0;
5102 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005103 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005104 if (kind == PyUnicode_1BYTE_KIND) {
5105 if (PyUnicode_IS_ASCII(unicode))
5106 ch = asciilib_utf16_decode(&q, e,
5107 PyUnicode_1BYTE_DATA(unicode), &outpos,
5108 native_ordering);
5109 else
5110 ch = ucs1lib_utf16_decode(&q, e,
5111 PyUnicode_1BYTE_DATA(unicode), &outpos,
5112 native_ordering);
5113 } else if (kind == PyUnicode_2BYTE_KIND) {
5114 ch = ucs2lib_utf16_decode(&q, e,
5115 PyUnicode_2BYTE_DATA(unicode), &outpos,
5116 native_ordering);
5117 } else {
5118 assert(kind == PyUnicode_4BYTE_KIND);
5119 ch = ucs4lib_utf16_decode(&q, e,
5120 PyUnicode_4BYTE_DATA(unicode), &outpos,
5121 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005122 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005123 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005124
Antoine Pitrou63065d72012-05-15 23:48:04 +02005125 switch (ch)
5126 {
5127 case 0:
5128 /* remaining byte at the end? (size should be even) */
5129 if (q == e || consumed)
5130 goto End;
5131 errmsg = "truncated data";
5132 startinpos = ((const char *)q) - starts;
5133 endinpos = ((const char *)e) - starts;
5134 break;
5135 /* The remaining input chars are ignored if the callback
5136 chooses to skip the input */
5137 case 1:
5138 errmsg = "unexpected end of data";
5139 startinpos = ((const char *)q) - 2 - starts;
5140 endinpos = ((const char *)e) - starts;
5141 break;
5142 case 2:
5143 errmsg = "illegal encoding";
5144 startinpos = ((const char *)q) - 2 - starts;
5145 endinpos = startinpos + 2;
5146 break;
5147 case 3:
5148 errmsg = "illegal UTF-16 surrogate";
5149 startinpos = ((const char *)q) - 4 - starts;
5150 endinpos = startinpos + 2;
5151 break;
5152 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005153 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5154 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005155 continue;
5156 }
5157
Benjamin Peterson29060642009-01-31 22:14:21 +00005158 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005159 errors,
5160 &errorHandler,
5161 "utf16", errmsg,
5162 &starts,
5163 (const char **)&e,
5164 &startinpos,
5165 &endinpos,
5166 &exc,
5167 (const char **)&q,
5168 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005169 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 }
5172
Antoine Pitrou63065d72012-05-15 23:48:04 +02005173End:
Walter Dörwald69652032004-09-07 20:24:22 +00005174 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005175 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005176
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005178 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 goto onError;
5180
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005181 Py_XDECREF(errorHandler);
5182 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005183 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184
Benjamin Peterson29060642009-01-31 22:14:21 +00005185 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005187 Py_XDECREF(errorHandler);
5188 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189 return NULL;
5190}
5191
Tim Peters772747b2001-08-09 22:21:55 +00005192PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005193_PyUnicode_EncodeUTF16(PyObject *str,
5194 const char *errors,
5195 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005197 enum PyUnicode_Kind kind;
5198 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005199 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005200 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005201 unsigned short *out;
5202 Py_ssize_t bytesize;
5203 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005204#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005205 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005206#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005207 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005208#endif
5209
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005210 if (!PyUnicode_Check(str)) {
5211 PyErr_BadArgument();
5212 return NULL;
5213 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005214 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005215 return NULL;
5216 kind = PyUnicode_KIND(str);
5217 data = PyUnicode_DATA(str);
5218 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005219
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005220 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005221 if (kind == PyUnicode_4BYTE_KIND) {
5222 const Py_UCS4 *in = (const Py_UCS4 *)data;
5223 const Py_UCS4 *end = in + len;
5224 while (in < end)
5225 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005226 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005227 }
5228 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005230 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005231 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 if (v == NULL)
5233 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005235 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005236 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005237 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005239 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005240 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005241 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005242
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005243 switch (kind) {
5244 case PyUnicode_1BYTE_KIND: {
5245 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5246 break;
Tim Peters772747b2001-08-09 22:21:55 +00005247 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005248 case PyUnicode_2BYTE_KIND: {
5249 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5250 break;
Tim Peters772747b2001-08-09 22:21:55 +00005251 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005252 case PyUnicode_4BYTE_KIND: {
5253 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5254 break;
5255 }
5256 default:
5257 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005258 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005259
5260 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005261 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262}
5263
Alexander Belopolsky40018472011-02-26 01:02:56 +00005264PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005265PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5266 Py_ssize_t size,
5267 const char *errors,
5268 int byteorder)
5269{
5270 PyObject *result;
5271 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5272 if (tmp == NULL)
5273 return NULL;
5274 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5275 Py_DECREF(tmp);
5276 return result;
5277}
5278
5279PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005280PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005282 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283}
5284
5285/* --- Unicode Escape Codec ----------------------------------------------- */
5286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005287/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5288 if all the escapes in the string make it still a valid ASCII string.
5289 Returns -1 if any escapes were found which cause the string to
5290 pop out of ASCII range. Otherwise returns the length of the
5291 required buffer to hold the string.
5292 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005293static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005294length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5295{
5296 const unsigned char *p = (const unsigned char *)s;
5297 const unsigned char *end = p + size;
5298 Py_ssize_t length = 0;
5299
5300 if (size < 0)
5301 return -1;
5302
5303 for (; p < end; ++p) {
5304 if (*p > 127) {
5305 /* Non-ASCII */
5306 return -1;
5307 }
5308 else if (*p != '\\') {
5309 /* Normal character */
5310 ++length;
5311 }
5312 else {
5313 /* Backslash-escape, check next char */
5314 ++p;
5315 /* Escape sequence reaches till end of string or
5316 non-ASCII follow-up. */
5317 if (p >= end || *p > 127)
5318 return -1;
5319 switch (*p) {
5320 case '\n':
5321 /* backslash + \n result in zero characters */
5322 break;
5323 case '\\': case '\'': case '\"':
5324 case 'b': case 'f': case 't':
5325 case 'n': case 'r': case 'v': case 'a':
5326 ++length;
5327 break;
5328 case '0': case '1': case '2': case '3':
5329 case '4': case '5': case '6': case '7':
5330 case 'x': case 'u': case 'U': case 'N':
5331 /* these do not guarantee ASCII characters */
5332 return -1;
5333 default:
5334 /* count the backslash + the other character */
5335 length += 2;
5336 }
5337 }
5338 }
5339 return length;
5340}
5341
Fredrik Lundh06d12682001-01-24 07:59:11 +00005342static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005343
Alexander Belopolsky40018472011-02-26 01:02:56 +00005344PyObject *
5345PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005346 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005347 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005349 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005350 Py_ssize_t startinpos;
5351 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005352 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005353 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005355 char* message;
5356 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005357 PyObject *errorHandler = NULL;
5358 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005359 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005360 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005361
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005362 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005363
5364 /* After length_of_escaped_ascii_string() there are two alternatives,
5365 either the string is pure ASCII with named escapes like \n, etc.
5366 and we determined it's exact size (common case)
5367 or it contains \x, \u, ... escape sequences. then we create a
5368 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005369 if (len >= 0) {
5370 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005371 if (!v)
5372 goto onError;
5373 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005374 }
5375 else {
5376 /* Escaped strings will always be longer than the resulting
5377 Unicode string, so we start with size here and then reduce the
5378 length after conversion to the true value.
5379 (but if the error callback returns a long replacement string
5380 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005381 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005382 if (!v)
5383 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005384 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005385 }
5386
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005388 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005389 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005391
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 while (s < end) {
5393 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005394 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005395 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005397 /* The only case in which i == ascii_length is a backslash
5398 followed by a newline. */
5399 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005400
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 /* Non-escape characters are interpreted as Unicode ordinals */
5402 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005403 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5404 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 continue;
5406 }
5407
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005408 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 /* \ - Escapes */
5410 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005411 c = *s++;
5412 if (s > end)
5413 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005414
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005415 /* The only case in which i == ascii_length is a backslash
5416 followed by a newline. */
5417 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005418
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005419 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005422#define WRITECHAR(ch) \
5423 do { \
5424 if (unicode_putchar(&v, &i, ch) < 0) \
5425 goto onError; \
5426 }while(0)
5427
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005429 case '\\': WRITECHAR('\\'); break;
5430 case '\'': WRITECHAR('\''); break;
5431 case '\"': WRITECHAR('\"'); break;
5432 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005433 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005434 case 'f': WRITECHAR('\014'); break;
5435 case 't': WRITECHAR('\t'); break;
5436 case 'n': WRITECHAR('\n'); break;
5437 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005438 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005439 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005440 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005441 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442
Benjamin Peterson29060642009-01-31 22:14:21 +00005443 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 case '0': case '1': case '2': case '3':
5445 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005446 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005447 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005448 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005449 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005450 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005452 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 break;
5454
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 /* hex escapes */
5456 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005458 digits = 2;
5459 message = "truncated \\xXX escape";
5460 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461
Benjamin Peterson29060642009-01-31 22:14:21 +00005462 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005464 digits = 4;
5465 message = "truncated \\uXXXX escape";
5466 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005469 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005470 digits = 8;
5471 message = "truncated \\UXXXXXXXX escape";
5472 hexescape:
5473 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005474 if (s+digits>end) {
5475 endinpos = size;
5476 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005477 errors, &errorHandler,
5478 "unicodeescape", "end of string in escape sequence",
5479 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005480 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005481 goto onError;
5482 goto nextByte;
5483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005484 for (j = 0; j < digits; ++j) {
5485 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005486 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005487 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 errors, &errorHandler,
5490 "unicodeescape", message,
5491 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005492 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005493 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005494 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005495 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005496 }
5497 chr = (chr<<4) & ~0xF;
5498 if (c >= '0' && c <= '9')
5499 chr += c - '0';
5500 else if (c >= 'a' && c <= 'f')
5501 chr += 10 + c - 'a';
5502 else
5503 chr += 10 + c - 'A';
5504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005505 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005506 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005507 /* _decoding_error will have already written into the
5508 target buffer. */
5509 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005510 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005511 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005512 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005513 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005514 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005516 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 errors, &errorHandler,
5518 "unicodeescape", "illegal Unicode character",
5519 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005520 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005521 goto onError;
5522 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005523 break;
5524
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005526 case 'N':
5527 message = "malformed \\N character escape";
5528 if (ucnhash_CAPI == NULL) {
5529 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005530 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5531 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005532 if (ucnhash_CAPI == NULL)
5533 goto ucnhashError;
5534 }
5535 if (*s == '{') {
5536 const char *start = s+1;
5537 /* look for the closing brace */
5538 while (*s != '}' && s < end)
5539 s++;
5540 if (s > start && s < end && *s == '}') {
5541 /* found a name. look it up in the unicode database */
5542 message = "unknown Unicode character name";
5543 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005544 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005545 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005546 goto store;
5547 }
5548 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005549 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005550 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 errors, &errorHandler,
5552 "unicodeescape", message,
5553 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005554 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005555 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005556 break;
5557
5558 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005559 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005560 message = "\\ at end of string";
5561 s--;
5562 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 errors, &errorHandler,
5565 "unicodeescape", message,
5566 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005567 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005568 goto onError;
5569 }
5570 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005571 WRITECHAR('\\');
5572 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005573 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005574 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005577 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005579#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005580
Victor Stinner16e6a802011-12-12 13:24:15 +01005581 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005582 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005583 Py_XDECREF(errorHandler);
5584 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005585 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005586
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005588 PyErr_SetString(
5589 PyExc_UnicodeError,
5590 "\\N escapes not supported (can't load unicodedata module)"
5591 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005592 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005593 Py_XDECREF(errorHandler);
5594 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005595 return NULL;
5596
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005599 Py_XDECREF(errorHandler);
5600 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 return NULL;
5602}
5603
5604/* Return a Unicode-Escape string version of the Unicode object.
5605
5606 If quotes is true, the string is enclosed in u"" or u'' quotes as
5607 appropriate.
5608
5609*/
5610
Alexander Belopolsky40018472011-02-26 01:02:56 +00005611PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005612PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005614 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005615 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005617 int kind;
5618 void *data;
5619 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620
Ezio Melottie7f90372012-10-05 03:33:31 +03005621 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005622 escape.
5623
Ezio Melottie7f90372012-10-05 03:33:31 +03005624 For UCS1 strings it's '\xxx', 4 bytes per source character.
5625 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5626 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005627 */
5628
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005629 if (!PyUnicode_Check(unicode)) {
5630 PyErr_BadArgument();
5631 return NULL;
5632 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005633 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005634 return NULL;
5635 len = PyUnicode_GET_LENGTH(unicode);
5636 kind = PyUnicode_KIND(unicode);
5637 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005638 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005639 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5640 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5641 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5642 }
5643
5644 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005645 return PyBytes_FromStringAndSize(NULL, 0);
5646
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005647 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005649
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005650 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005652 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 if (repr == NULL)
5655 return NULL;
5656
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005657 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005659 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005660 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005661
Walter Dörwald79e913e2007-05-12 11:08:06 +00005662 /* Escape backslashes */
5663 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 *p++ = '\\';
5665 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005666 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005667 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005668
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005669 /* Map 21-bit characters to '\U00xxxxxx' */
5670 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005671 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005672 *p++ = '\\';
5673 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005674 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5675 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5676 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5677 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5678 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5679 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5680 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5681 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005683 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005684
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005686 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 *p++ = '\\';
5688 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005689 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5690 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5691 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5692 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005694
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005695 /* Map special whitespace to '\t', \n', '\r' */
5696 else if (ch == '\t') {
5697 *p++ = '\\';
5698 *p++ = 't';
5699 }
5700 else if (ch == '\n') {
5701 *p++ = '\\';
5702 *p++ = 'n';
5703 }
5704 else if (ch == '\r') {
5705 *p++ = '\\';
5706 *p++ = 'r';
5707 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005708
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005709 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005710 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005712 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005713 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5714 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005715 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005716
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 /* Copy everything else as-is */
5718 else
5719 *p++ = (char) ch;
5720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005722 assert(p - PyBytes_AS_STRING(repr) > 0);
5723 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5724 return NULL;
5725 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726}
5727
Alexander Belopolsky40018472011-02-26 01:02:56 +00005728PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005729PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5730 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005732 PyObject *result;
5733 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5734 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005736 result = PyUnicode_AsUnicodeEscapeString(tmp);
5737 Py_DECREF(tmp);
5738 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739}
5740
5741/* --- Raw Unicode Escape Codec ------------------------------------------- */
5742
Alexander Belopolsky40018472011-02-26 01:02:56 +00005743PyObject *
5744PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005745 Py_ssize_t size,
5746 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005748 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005749 Py_ssize_t startinpos;
5750 Py_ssize_t endinpos;
5751 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005752 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 const char *end;
5754 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755 PyObject *errorHandler = NULL;
5756 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005757
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 /* Escaped strings will always be longer than the resulting
5759 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005760 length after conversion to the true value. (But decoding error
5761 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005762 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005766 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005767 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 end = s + size;
5769 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 unsigned char c;
5771 Py_UCS4 x;
5772 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005773 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 /* Non-escape characters are interpreted as Unicode ordinals */
5776 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005777 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5778 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005780 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 startinpos = s-starts;
5782
5783 /* \u-escapes are only interpreted iff the number of leading
5784 backslashes if odd */
5785 bs = s;
5786 for (;s < end;) {
5787 if (*s != '\\')
5788 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005789 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5790 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 }
5792 if (((s - bs) & 1) == 0 ||
5793 s >= end ||
5794 (*s != 'u' && *s != 'U')) {
5795 continue;
5796 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005797 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 count = *s=='u' ? 4 : 8;
5799 s++;
5800
5801 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 for (x = 0, i = 0; i < count; ++i, ++s) {
5803 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005804 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 endinpos = s-starts;
5806 if (unicode_decode_call_errorhandler(
5807 errors, &errorHandler,
5808 "rawunicodeescape", "truncated \\uXXXX",
5809 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005810 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 goto onError;
5812 goto nextByte;
5813 }
5814 x = (x<<4) & ~0xF;
5815 if (c >= '0' && c <= '9')
5816 x += c - '0';
5817 else if (c >= 'a' && c <= 'f')
5818 x += 10 + c - 'a';
5819 else
5820 x += 10 + c - 'A';
5821 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005822 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005823 if (unicode_putchar(&v, &outpos, x) < 0)
5824 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005825 } else {
5826 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005827 if (unicode_decode_call_errorhandler(
5828 errors, &errorHandler,
5829 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005831 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005833 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 nextByte:
5835 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 }
Victor Stinner16e6a802011-12-12 13:24:15 +01005837 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 Py_XDECREF(errorHandler);
5840 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005841 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00005842
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 Py_XDECREF(errorHandler);
5846 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 return NULL;
5848}
5849
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005850
Alexander Belopolsky40018472011-02-26 01:02:56 +00005851PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005852PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005854 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 char *p;
5856 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005857 Py_ssize_t expandsize, pos;
5858 int kind;
5859 void *data;
5860 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005862 if (!PyUnicode_Check(unicode)) {
5863 PyErr_BadArgument();
5864 return NULL;
5865 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005866 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005867 return NULL;
5868 kind = PyUnicode_KIND(unicode);
5869 data = PyUnicode_DATA(unicode);
5870 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005871 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5872 bytes, and 1 byte characters 4. */
5873 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005874
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005875 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005877
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005878 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 if (repr == NULL)
5880 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005881 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005882 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005884 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005885 for (pos = 0; pos < len; pos++) {
5886 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 /* Map 32-bit characters to '\Uxxxxxxxx' */
5888 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005889 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005890 *p++ = '\\';
5891 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005892 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5893 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5894 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5895 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5896 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5897 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5898 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5899 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005900 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 *p++ = '\\';
5904 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005905 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5906 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5907 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5908 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 /* Copy everything else as-is */
5911 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 *p++ = (char) ch;
5913 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005914
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005915 assert(p > q);
5916 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005917 return NULL;
5918 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919}
5920
Alexander Belopolsky40018472011-02-26 01:02:56 +00005921PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005922PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5923 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005925 PyObject *result;
5926 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5927 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005928 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005929 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5930 Py_DECREF(tmp);
5931 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932}
5933
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005934/* --- Unicode Internal Codec ------------------------------------------- */
5935
Alexander Belopolsky40018472011-02-26 01:02:56 +00005936PyObject *
5937_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005938 Py_ssize_t size,
5939 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005940{
5941 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005942 Py_ssize_t startinpos;
5943 Py_ssize_t endinpos;
5944 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005945 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005946 const char *end;
5947 const char *reason;
5948 PyObject *errorHandler = NULL;
5949 PyObject *exc = NULL;
5950
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005951 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005952 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005953 1))
5954 return NULL;
5955
Thomas Wouters89f507f2006-12-13 04:49:30 +00005956 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005957 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005958 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005960 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005961 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005962 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005963 end = s + size;
5964
5965 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005966 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005967 Py_UCS4 ch;
5968 /* We copy the raw representation one byte at a time because the
5969 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005970 ((char *) &uch)[0] = s[0];
5971 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005972#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005973 ((char *) &uch)[2] = s[2];
5974 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005975#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005976 ch = uch;
5977
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005978 /* We have to sanity check the raw data, otherwise doom looms for
5979 some malformed UCS-4 data. */
5980 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005981#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005982 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005983#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005984 end-s < Py_UNICODE_SIZE
5985 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005987 startinpos = s - starts;
5988 if (end-s < Py_UNICODE_SIZE) {
5989 endinpos = end-starts;
5990 reason = "truncated input";
5991 }
5992 else {
5993 endinpos = s - starts + Py_UNICODE_SIZE;
5994 reason = "illegal code point (> 0x10FFFF)";
5995 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005996 if (unicode_decode_call_errorhandler(
5997 errors, &errorHandler,
5998 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005999 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006000 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006001 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006002 continue;
6003 }
6004
6005 s += Py_UNICODE_SIZE;
6006#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006007 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006008 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006009 Py_UNICODE uch2;
6010 ((char *) &uch2)[0] = s[0];
6011 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006012 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006013 {
Victor Stinner551ac952011-11-29 22:58:13 +01006014 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006015 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006016 }
6017 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006018#endif
6019
6020 if (unicode_putchar(&v, &outpos, ch) < 0)
6021 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006022 }
6023
Victor Stinner16e6a802011-12-12 13:24:15 +01006024 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006025 goto onError;
6026 Py_XDECREF(errorHandler);
6027 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006028 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006029
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006031 Py_XDECREF(v);
6032 Py_XDECREF(errorHandler);
6033 Py_XDECREF(exc);
6034 return NULL;
6035}
6036
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037/* --- Latin-1 Codec ------------------------------------------------------ */
6038
Alexander Belopolsky40018472011-02-26 01:02:56 +00006039PyObject *
6040PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006041 Py_ssize_t size,
6042 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006045 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046}
6047
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006048/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006049static void
6050make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006051 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006052 PyObject *unicode,
6053 Py_ssize_t startpos, Py_ssize_t endpos,
6054 const char *reason)
6055{
6056 if (*exceptionObject == NULL) {
6057 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006058 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006059 encoding, unicode, startpos, endpos, reason);
6060 }
6061 else {
6062 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6063 goto onError;
6064 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6065 goto onError;
6066 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6067 goto onError;
6068 return;
6069 onError:
6070 Py_DECREF(*exceptionObject);
6071 *exceptionObject = NULL;
6072 }
6073}
6074
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006075/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006076static void
6077raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006078 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006079 PyObject *unicode,
6080 Py_ssize_t startpos, Py_ssize_t endpos,
6081 const char *reason)
6082{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006083 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006084 encoding, unicode, startpos, endpos, reason);
6085 if (*exceptionObject != NULL)
6086 PyCodec_StrictErrors(*exceptionObject);
6087}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006088
6089/* error handling callback helper:
6090 build arguments, call the callback and check the arguments,
6091 put the result into newpos and return the replacement string, which
6092 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006093static PyObject *
6094unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006095 PyObject **errorHandler,
6096 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006097 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006098 Py_ssize_t startpos, Py_ssize_t endpos,
6099 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006100{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006101 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006102 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006103 PyObject *restuple;
6104 PyObject *resunicode;
6105
6106 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006110 }
6111
Benjamin Petersonbac79492012-01-14 13:34:47 -05006112 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006113 return NULL;
6114 len = PyUnicode_GET_LENGTH(unicode);
6115
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006116 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006117 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006118 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120
6121 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006123 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006125 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006126 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 Py_DECREF(restuple);
6128 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006130 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 &resunicode, newpos)) {
6132 Py_DECREF(restuple);
6133 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006135 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6136 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6137 Py_DECREF(restuple);
6138 return NULL;
6139 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006141 *newpos = len + *newpos;
6142 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6144 Py_DECREF(restuple);
6145 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006146 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 Py_INCREF(resunicode);
6148 Py_DECREF(restuple);
6149 return resunicode;
6150}
6151
Alexander Belopolsky40018472011-02-26 01:02:56 +00006152static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006153unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006154 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006155 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006156{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006157 /* input state */
6158 Py_ssize_t pos=0, size;
6159 int kind;
6160 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006161 /* output object */
6162 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006163 /* pointer into the output */
6164 char *str;
6165 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006166 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006167 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6168 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006169 PyObject *errorHandler = NULL;
6170 PyObject *exc = NULL;
6171 /* the following variable is used for caching string comparisons
6172 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6173 int known_errorHandler = -1;
6174
Benjamin Petersonbac79492012-01-14 13:34:47 -05006175 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006176 return NULL;
6177 size = PyUnicode_GET_LENGTH(unicode);
6178 kind = PyUnicode_KIND(unicode);
6179 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006180 /* allocate enough for a simple encoding without
6181 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006182 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006183 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006184 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006186 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006187 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006188 ressize = size;
6189
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006190 while (pos < size) {
6191 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 /* can we encode this? */
6194 if (c<limit) {
6195 /* no overflow check, because we know that the space is enough */
6196 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006197 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006198 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 Py_ssize_t requiredsize;
6201 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006202 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006204 Py_ssize_t collstart = pos;
6205 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006207 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 ++collend;
6209 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6210 if (known_errorHandler==-1) {
6211 if ((errors==NULL) || (!strcmp(errors, "strict")))
6212 known_errorHandler = 1;
6213 else if (!strcmp(errors, "replace"))
6214 known_errorHandler = 2;
6215 else if (!strcmp(errors, "ignore"))
6216 known_errorHandler = 3;
6217 else if (!strcmp(errors, "xmlcharrefreplace"))
6218 known_errorHandler = 4;
6219 else
6220 known_errorHandler = 0;
6221 }
6222 switch (known_errorHandler) {
6223 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006224 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006225 goto onError;
6226 case 2: /* replace */
6227 while (collstart++<collend)
6228 *str++ = '?'; /* fall through */
6229 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006230 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 break;
6232 case 4: /* xmlcharrefreplace */
6233 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006234 /* determine replacement size */
6235 for (i = collstart, repsize = 0; i < collend; ++i) {
6236 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6237 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006239 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006241 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006243 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006245 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006247 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006249 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006250 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006252 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006254 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 if (requiredsize > ressize) {
6256 if (requiredsize<2*ressize)
6257 requiredsize = 2*ressize;
6258 if (_PyBytes_Resize(&res, requiredsize))
6259 goto onError;
6260 str = PyBytes_AS_STRING(res) + respos;
6261 ressize = requiredsize;
6262 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006263 /* generate replacement */
6264 for (i = collstart; i < collend; ++i) {
6265 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006267 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 break;
6269 default:
6270 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006271 encoding, reason, unicode, &exc,
6272 collstart, collend, &newpos);
6273 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006274 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006276 if (PyBytes_Check(repunicode)) {
6277 /* Directly copy bytes result to output. */
6278 repsize = PyBytes_Size(repunicode);
6279 if (repsize > 1) {
6280 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006281 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006282 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6283 Py_DECREF(repunicode);
6284 goto onError;
6285 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006286 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006287 ressize += repsize-1;
6288 }
6289 memcpy(str, PyBytes_AsString(repunicode), repsize);
6290 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006291 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006292 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006293 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006294 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 /* need more space? (at least enough for what we
6296 have+the replacement+the rest of the string, so
6297 we won't have to check space for encodable characters) */
6298 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006299 repsize = PyUnicode_GET_LENGTH(repunicode);
6300 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006301 if (requiredsize > ressize) {
6302 if (requiredsize<2*ressize)
6303 requiredsize = 2*ressize;
6304 if (_PyBytes_Resize(&res, requiredsize)) {
6305 Py_DECREF(repunicode);
6306 goto onError;
6307 }
6308 str = PyBytes_AS_STRING(res) + respos;
6309 ressize = requiredsize;
6310 }
6311 /* check if there is anything unencodable in the replacement
6312 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006313 for (i = 0; repsize-->0; ++i, ++str) {
6314 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006316 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006317 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006318 Py_DECREF(repunicode);
6319 goto onError;
6320 }
6321 *str = (char)c;
6322 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006323 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006324 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006325 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006326 }
6327 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006328 /* Resize if we allocated to much */
6329 size = str - PyBytes_AS_STRING(res);
6330 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006331 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006332 if (_PyBytes_Resize(&res, size) < 0)
6333 goto onError;
6334 }
6335
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006336 Py_XDECREF(errorHandler);
6337 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006338 return res;
6339
6340 onError:
6341 Py_XDECREF(res);
6342 Py_XDECREF(errorHandler);
6343 Py_XDECREF(exc);
6344 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345}
6346
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006347/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006348PyObject *
6349PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006350 Py_ssize_t size,
6351 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006353 PyObject *result;
6354 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6355 if (unicode == NULL)
6356 return NULL;
6357 result = unicode_encode_ucs1(unicode, errors, 256);
6358 Py_DECREF(unicode);
6359 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360}
6361
Alexander Belopolsky40018472011-02-26 01:02:56 +00006362PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006363_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364{
6365 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 PyErr_BadArgument();
6367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006369 if (PyUnicode_READY(unicode) == -1)
6370 return NULL;
6371 /* Fast path: if it is a one-byte string, construct
6372 bytes object directly. */
6373 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6374 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6375 PyUnicode_GET_LENGTH(unicode));
6376 /* Non-Latin-1 characters present. Defer to above function to
6377 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006378 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006379}
6380
6381PyObject*
6382PyUnicode_AsLatin1String(PyObject *unicode)
6383{
6384 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385}
6386
6387/* --- 7-bit ASCII Codec -------------------------------------------------- */
6388
Alexander Belopolsky40018472011-02-26 01:02:56 +00006389PyObject *
6390PyUnicode_DecodeASCII(const char *s,
6391 Py_ssize_t size,
6392 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006394 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006395 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006396 int kind;
6397 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006398 Py_ssize_t startinpos;
6399 Py_ssize_t endinpos;
6400 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401 const char *e;
6402 PyObject *errorHandler = NULL;
6403 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006404
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006405 if (size == 0) {
6406 Py_INCREF(unicode_empty);
6407 return unicode_empty;
6408 }
6409
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006411 if (size == 1 && (unsigned char)s[0] < 128)
6412 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006413
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006414 unicode = PyUnicode_New(size, 127);
6415 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006417
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006419 data = PyUnicode_1BYTE_DATA(unicode);
6420 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6421 if (outpos == size)
6422 return unicode;
6423
6424 s += outpos;
6425 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006426 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 register unsigned char c = (unsigned char)*s;
6428 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006429 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 ++s;
6431 }
6432 else {
6433 startinpos = s-starts;
6434 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 if (unicode_decode_call_errorhandler(
6436 errors, &errorHandler,
6437 "ascii", "ordinal not in range(128)",
6438 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006439 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006440 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006441 kind = PyUnicode_KIND(unicode);
6442 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006445 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006446 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006447 Py_XDECREF(errorHandler);
6448 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006449 assert(_PyUnicode_CheckConsistency(unicode, 1));
6450 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006451
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006453 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006454 Py_XDECREF(errorHandler);
6455 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 return NULL;
6457}
6458
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006459/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006460PyObject *
6461PyUnicode_EncodeASCII(const Py_UNICODE *p,
6462 Py_ssize_t size,
6463 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006465 PyObject *result;
6466 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6467 if (unicode == NULL)
6468 return NULL;
6469 result = unicode_encode_ucs1(unicode, errors, 128);
6470 Py_DECREF(unicode);
6471 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472}
6473
Alexander Belopolsky40018472011-02-26 01:02:56 +00006474PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006475_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476{
6477 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 PyErr_BadArgument();
6479 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006481 if (PyUnicode_READY(unicode) == -1)
6482 return NULL;
6483 /* Fast path: if it is an ASCII-only string, construct bytes object
6484 directly. Else defer to above function to raise the exception. */
6485 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6486 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6487 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006488 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006489}
6490
6491PyObject *
6492PyUnicode_AsASCIIString(PyObject *unicode)
6493{
6494 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495}
6496
Victor Stinner99b95382011-07-04 14:23:54 +02006497#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006498
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006499/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006500
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006501#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006502#define NEED_RETRY
6503#endif
6504
Victor Stinner3a50e702011-10-18 21:21:00 +02006505#ifndef WC_ERR_INVALID_CHARS
6506# define WC_ERR_INVALID_CHARS 0x0080
6507#endif
6508
6509static char*
6510code_page_name(UINT code_page, PyObject **obj)
6511{
6512 *obj = NULL;
6513 if (code_page == CP_ACP)
6514 return "mbcs";
6515 if (code_page == CP_UTF7)
6516 return "CP_UTF7";
6517 if (code_page == CP_UTF8)
6518 return "CP_UTF8";
6519
6520 *obj = PyBytes_FromFormat("cp%u", code_page);
6521 if (*obj == NULL)
6522 return NULL;
6523 return PyBytes_AS_STRING(*obj);
6524}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006525
Alexander Belopolsky40018472011-02-26 01:02:56 +00006526static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006527is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006528{
6529 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006530 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006531
Victor Stinner3a50e702011-10-18 21:21:00 +02006532 if (!IsDBCSLeadByteEx(code_page, *curr))
6533 return 0;
6534
6535 prev = CharPrevExA(code_page, s, curr, 0);
6536 if (prev == curr)
6537 return 1;
6538 /* FIXME: This code is limited to "true" double-byte encodings,
6539 as it assumes an incomplete character consists of a single
6540 byte. */
6541 if (curr - prev == 2)
6542 return 1;
6543 if (!IsDBCSLeadByteEx(code_page, *prev))
6544 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006545 return 0;
6546}
6547
Victor Stinner3a50e702011-10-18 21:21:00 +02006548static DWORD
6549decode_code_page_flags(UINT code_page)
6550{
6551 if (code_page == CP_UTF7) {
6552 /* The CP_UTF7 decoder only supports flags=0 */
6553 return 0;
6554 }
6555 else
6556 return MB_ERR_INVALID_CHARS;
6557}
6558
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006559/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006560 * Decode a byte string from a Windows code page into unicode object in strict
6561 * mode.
6562 *
6563 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6564 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006565 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006566static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006567decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006568 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006569 const char *in,
6570 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006571{
Victor Stinner3a50e702011-10-18 21:21:00 +02006572 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006573 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006574 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006575
6576 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006577 assert(insize > 0);
6578 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6579 if (outsize <= 0)
6580 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006581
6582 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006584 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006585 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 if (*v == NULL)
6587 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006588 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006589 }
6590 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006592 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006593 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006595 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006596 }
6597
6598 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006599 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6600 if (outsize <= 0)
6601 goto error;
6602 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006603
Victor Stinner3a50e702011-10-18 21:21:00 +02006604error:
6605 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6606 return -2;
6607 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006608 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006609}
6610
Victor Stinner3a50e702011-10-18 21:21:00 +02006611/*
6612 * Decode a byte string from a code page into unicode object with an error
6613 * handler.
6614 *
6615 * Returns consumed size if succeed, or raise a WindowsError or
6616 * UnicodeDecodeError exception and returns -1 on error.
6617 */
6618static int
6619decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006620 PyObject **v,
6621 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006622 const char *errors)
6623{
6624 const char *startin = in;
6625 const char *endin = in + size;
6626 const DWORD flags = decode_code_page_flags(code_page);
6627 /* Ideally, we should get reason from FormatMessage. This is the Windows
6628 2000 English version of the message. */
6629 const char *reason = "No mapping for the Unicode character exists "
6630 "in the target code page.";
6631 /* each step cannot decode more than 1 character, but a character can be
6632 represented as a surrogate pair */
6633 wchar_t buffer[2], *startout, *out;
6634 int insize, outsize;
6635 PyObject *errorHandler = NULL;
6636 PyObject *exc = NULL;
6637 PyObject *encoding_obj = NULL;
6638 char *encoding;
6639 DWORD err;
6640 int ret = -1;
6641
6642 assert(size > 0);
6643
6644 encoding = code_page_name(code_page, &encoding_obj);
6645 if (encoding == NULL)
6646 return -1;
6647
6648 if (errors == NULL || strcmp(errors, "strict") == 0) {
6649 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6650 UnicodeDecodeError. */
6651 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6652 if (exc != NULL) {
6653 PyCodec_StrictErrors(exc);
6654 Py_CLEAR(exc);
6655 }
6656 goto error;
6657 }
6658
6659 if (*v == NULL) {
6660 /* Create unicode object */
6661 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6662 PyErr_NoMemory();
6663 goto error;
6664 }
Victor Stinnerab595942011-12-17 04:59:06 +01006665 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006666 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006667 if (*v == NULL)
6668 goto error;
6669 startout = PyUnicode_AS_UNICODE(*v);
6670 }
6671 else {
6672 /* Extend unicode object */
6673 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6674 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6675 PyErr_NoMemory();
6676 goto error;
6677 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006678 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006679 goto error;
6680 startout = PyUnicode_AS_UNICODE(*v) + n;
6681 }
6682
6683 /* Decode the byte string character per character */
6684 out = startout;
6685 while (in < endin)
6686 {
6687 /* Decode a character */
6688 insize = 1;
6689 do
6690 {
6691 outsize = MultiByteToWideChar(code_page, flags,
6692 in, insize,
6693 buffer, Py_ARRAY_LENGTH(buffer));
6694 if (outsize > 0)
6695 break;
6696 err = GetLastError();
6697 if (err != ERROR_NO_UNICODE_TRANSLATION
6698 && err != ERROR_INSUFFICIENT_BUFFER)
6699 {
6700 PyErr_SetFromWindowsErr(0);
6701 goto error;
6702 }
6703 insize++;
6704 }
6705 /* 4=maximum length of a UTF-8 sequence */
6706 while (insize <= 4 && (in + insize) <= endin);
6707
6708 if (outsize <= 0) {
6709 Py_ssize_t startinpos, endinpos, outpos;
6710
6711 startinpos = in - startin;
6712 endinpos = startinpos + 1;
6713 outpos = out - PyUnicode_AS_UNICODE(*v);
6714 if (unicode_decode_call_errorhandler(
6715 errors, &errorHandler,
6716 encoding, reason,
6717 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006718 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006719 {
6720 goto error;
6721 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006722 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006723 }
6724 else {
6725 in += insize;
6726 memcpy(out, buffer, outsize * sizeof(wchar_t));
6727 out += outsize;
6728 }
6729 }
6730
6731 /* write a NUL character at the end */
6732 *out = 0;
6733
6734 /* Extend unicode object */
6735 outsize = out - startout;
6736 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006737 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006738 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006739 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006740
6741error:
6742 Py_XDECREF(encoding_obj);
6743 Py_XDECREF(errorHandler);
6744 Py_XDECREF(exc);
6745 return ret;
6746}
6747
Victor Stinner3a50e702011-10-18 21:21:00 +02006748static PyObject *
6749decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006750 const char *s, Py_ssize_t size,
6751 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006752{
Victor Stinner76a31a62011-11-04 00:05:13 +01006753 PyObject *v = NULL;
6754 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006755
Victor Stinner3a50e702011-10-18 21:21:00 +02006756 if (code_page < 0) {
6757 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6758 return NULL;
6759 }
6760
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006761 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006763
Victor Stinner76a31a62011-11-04 00:05:13 +01006764 do
6765 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006766#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006767 if (size > INT_MAX) {
6768 chunk_size = INT_MAX;
6769 final = 0;
6770 done = 0;
6771 }
6772 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006773#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006774 {
6775 chunk_size = (int)size;
6776 final = (consumed == NULL);
6777 done = 1;
6778 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006779
Victor Stinner76a31a62011-11-04 00:05:13 +01006780 /* Skip trailing lead-byte unless 'final' is set */
6781 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6782 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006783
Victor Stinner76a31a62011-11-04 00:05:13 +01006784 if (chunk_size == 0 && done) {
6785 if (v != NULL)
6786 break;
6787 Py_INCREF(unicode_empty);
6788 return unicode_empty;
6789 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006790
Victor Stinner76a31a62011-11-04 00:05:13 +01006791
6792 converted = decode_code_page_strict(code_page, &v,
6793 s, chunk_size);
6794 if (converted == -2)
6795 converted = decode_code_page_errors(code_page, &v,
6796 s, chunk_size,
6797 errors);
6798 assert(converted != 0);
6799
6800 if (converted < 0) {
6801 Py_XDECREF(v);
6802 return NULL;
6803 }
6804
6805 if (consumed)
6806 *consumed += converted;
6807
6808 s += converted;
6809 size -= converted;
6810 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006811
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006812 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006813}
6814
Alexander Belopolsky40018472011-02-26 01:02:56 +00006815PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006816PyUnicode_DecodeCodePageStateful(int code_page,
6817 const char *s,
6818 Py_ssize_t size,
6819 const char *errors,
6820 Py_ssize_t *consumed)
6821{
6822 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6823}
6824
6825PyObject *
6826PyUnicode_DecodeMBCSStateful(const char *s,
6827 Py_ssize_t size,
6828 const char *errors,
6829 Py_ssize_t *consumed)
6830{
6831 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6832}
6833
6834PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006835PyUnicode_DecodeMBCS(const char *s,
6836 Py_ssize_t size,
6837 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006838{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006839 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6840}
6841
Victor Stinner3a50e702011-10-18 21:21:00 +02006842static DWORD
6843encode_code_page_flags(UINT code_page, const char *errors)
6844{
6845 if (code_page == CP_UTF8) {
6846 if (winver.dwMajorVersion >= 6)
6847 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6848 and later */
6849 return WC_ERR_INVALID_CHARS;
6850 else
6851 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6852 return 0;
6853 }
6854 else if (code_page == CP_UTF7) {
6855 /* CP_UTF7 only supports flags=0 */
6856 return 0;
6857 }
6858 else {
6859 if (errors != NULL && strcmp(errors, "replace") == 0)
6860 return 0;
6861 else
6862 return WC_NO_BEST_FIT_CHARS;
6863 }
6864}
6865
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006866/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006867 * Encode a Unicode string to a Windows code page into a byte string in strict
6868 * mode.
6869 *
6870 * Returns consumed characters if succeed, returns -2 on encode error, or raise
6871 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006872 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006873static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006874encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006875 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006876 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006877{
Victor Stinner554f3f02010-06-16 23:33:54 +00006878 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006879 BOOL *pusedDefaultChar = &usedDefaultChar;
6880 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006881 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006882 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006883 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006884 const DWORD flags = encode_code_page_flags(code_page, NULL);
6885 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006886 /* Create a substring so that we can get the UTF-16 representation
6887 of just the slice under consideration. */
6888 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006889
Martin v. Löwis3d325192011-11-04 18:23:06 +01006890 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006891
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006893 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006894 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006895 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006896
Victor Stinner2fc507f2011-11-04 20:06:39 +01006897 substring = PyUnicode_Substring(unicode, offset, offset+len);
6898 if (substring == NULL)
6899 return -1;
6900 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6901 if (p == NULL) {
6902 Py_DECREF(substring);
6903 return -1;
6904 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006905
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006906 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006907 outsize = WideCharToMultiByte(code_page, flags,
6908 p, size,
6909 NULL, 0,
6910 NULL, pusedDefaultChar);
6911 if (outsize <= 0)
6912 goto error;
6913 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006914 if (pusedDefaultChar && *pusedDefaultChar) {
6915 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006916 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006917 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006918
Victor Stinner3a50e702011-10-18 21:21:00 +02006919 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006921 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006922 if (*outbytes == NULL) {
6923 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006925 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006926 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006927 }
6928 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006930 const Py_ssize_t n = PyBytes_Size(*outbytes);
6931 if (outsize > PY_SSIZE_T_MAX - n) {
6932 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006933 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006935 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006936 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6937 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006938 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006939 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006940 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006941 }
6942
6943 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006944 outsize = WideCharToMultiByte(code_page, flags,
6945 p, size,
6946 out, outsize,
6947 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006948 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006949 if (outsize <= 0)
6950 goto error;
6951 if (pusedDefaultChar && *pusedDefaultChar)
6952 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006953 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006954
Victor Stinner3a50e702011-10-18 21:21:00 +02006955error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006956 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006957 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6958 return -2;
6959 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006960 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006961}
6962
Victor Stinner3a50e702011-10-18 21:21:00 +02006963/*
6964 * Encode a Unicode string to a Windows code page into a byte string using a
6965 * error handler.
6966 *
6967 * Returns consumed characters if succeed, or raise a WindowsError and returns
6968 * -1 on other error.
6969 */
6970static int
6971encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01006972 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006973 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006974{
Victor Stinner3a50e702011-10-18 21:21:00 +02006975 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006976 Py_ssize_t pos = unicode_offset;
6977 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006978 /* Ideally, we should get reason from FormatMessage. This is the Windows
6979 2000 English version of the message. */
6980 const char *reason = "invalid character";
6981 /* 4=maximum length of a UTF-8 sequence */
6982 char buffer[4];
6983 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
6984 Py_ssize_t outsize;
6985 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006986 PyObject *errorHandler = NULL;
6987 PyObject *exc = NULL;
6988 PyObject *encoding_obj = NULL;
6989 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01006990 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006991 PyObject *rep;
6992 int ret = -1;
6993
6994 assert(insize > 0);
6995
6996 encoding = code_page_name(code_page, &encoding_obj);
6997 if (encoding == NULL)
6998 return -1;
6999
7000 if (errors == NULL || strcmp(errors, "strict") == 0) {
7001 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7002 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007003 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007004 if (exc != NULL) {
7005 PyCodec_StrictErrors(exc);
7006 Py_DECREF(exc);
7007 }
7008 Py_XDECREF(encoding_obj);
7009 return -1;
7010 }
7011
7012 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7013 pusedDefaultChar = &usedDefaultChar;
7014 else
7015 pusedDefaultChar = NULL;
7016
7017 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7018 PyErr_NoMemory();
7019 goto error;
7020 }
7021 outsize = insize * Py_ARRAY_LENGTH(buffer);
7022
7023 if (*outbytes == NULL) {
7024 /* Create string object */
7025 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7026 if (*outbytes == NULL)
7027 goto error;
7028 out = PyBytes_AS_STRING(*outbytes);
7029 }
7030 else {
7031 /* Extend string object */
7032 Py_ssize_t n = PyBytes_Size(*outbytes);
7033 if (n > PY_SSIZE_T_MAX - outsize) {
7034 PyErr_NoMemory();
7035 goto error;
7036 }
7037 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7038 goto error;
7039 out = PyBytes_AS_STRING(*outbytes) + n;
7040 }
7041
7042 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007043 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007044 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007045 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7046 wchar_t chars[2];
7047 int charsize;
7048 if (ch < 0x10000) {
7049 chars[0] = (wchar_t)ch;
7050 charsize = 1;
7051 }
7052 else {
7053 ch -= 0x10000;
7054 chars[0] = 0xd800 + (ch >> 10);
7055 chars[1] = 0xdc00 + (ch & 0x3ff);
7056 charsize = 2;
7057 }
7058
Victor Stinner3a50e702011-10-18 21:21:00 +02007059 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007060 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007061 buffer, Py_ARRAY_LENGTH(buffer),
7062 NULL, pusedDefaultChar);
7063 if (outsize > 0) {
7064 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7065 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007066 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007067 memcpy(out, buffer, outsize);
7068 out += outsize;
7069 continue;
7070 }
7071 }
7072 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7073 PyErr_SetFromWindowsErr(0);
7074 goto error;
7075 }
7076
Victor Stinner3a50e702011-10-18 21:21:00 +02007077 rep = unicode_encode_call_errorhandler(
7078 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007079 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007080 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007081 if (rep == NULL)
7082 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007083 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007084
7085 if (PyBytes_Check(rep)) {
7086 outsize = PyBytes_GET_SIZE(rep);
7087 if (outsize != 1) {
7088 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7089 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7090 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7091 Py_DECREF(rep);
7092 goto error;
7093 }
7094 out = PyBytes_AS_STRING(*outbytes) + offset;
7095 }
7096 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7097 out += outsize;
7098 }
7099 else {
7100 Py_ssize_t i;
7101 enum PyUnicode_Kind kind;
7102 void *data;
7103
Benjamin Petersonbac79492012-01-14 13:34:47 -05007104 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007105 Py_DECREF(rep);
7106 goto error;
7107 }
7108
7109 outsize = PyUnicode_GET_LENGTH(rep);
7110 if (outsize != 1) {
7111 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7112 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7113 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7114 Py_DECREF(rep);
7115 goto error;
7116 }
7117 out = PyBytes_AS_STRING(*outbytes) + offset;
7118 }
7119 kind = PyUnicode_KIND(rep);
7120 data = PyUnicode_DATA(rep);
7121 for (i=0; i < outsize; i++) {
7122 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7123 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007124 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007125 encoding, unicode,
7126 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007127 "unable to encode error handler result to ASCII");
7128 Py_DECREF(rep);
7129 goto error;
7130 }
7131 *out = (unsigned char)ch;
7132 out++;
7133 }
7134 }
7135 Py_DECREF(rep);
7136 }
7137 /* write a NUL byte */
7138 *out = 0;
7139 outsize = out - PyBytes_AS_STRING(*outbytes);
7140 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7141 if (_PyBytes_Resize(outbytes, outsize) < 0)
7142 goto error;
7143 ret = 0;
7144
7145error:
7146 Py_XDECREF(encoding_obj);
7147 Py_XDECREF(errorHandler);
7148 Py_XDECREF(exc);
7149 return ret;
7150}
7151
Victor Stinner3a50e702011-10-18 21:21:00 +02007152static PyObject *
7153encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007154 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 const char *errors)
7156{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007157 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007158 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007159 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007160 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007161
Benjamin Petersonbac79492012-01-14 13:34:47 -05007162 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007163 return NULL;
7164 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007165
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 if (code_page < 0) {
7167 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7168 return NULL;
7169 }
7170
Martin v. Löwis3d325192011-11-04 18:23:06 +01007171 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007172 return PyBytes_FromStringAndSize(NULL, 0);
7173
Victor Stinner7581cef2011-11-03 22:32:33 +01007174 offset = 0;
7175 do
7176 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007177#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007178 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007179 chunks. */
7180 if (len > INT_MAX/2) {
7181 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007182 done = 0;
7183 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007184 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007185#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007186 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007187 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007188 done = 1;
7189 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007190
Victor Stinner76a31a62011-11-04 00:05:13 +01007191 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007192 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007193 errors);
7194 if (ret == -2)
7195 ret = encode_code_page_errors(code_page, &outbytes,
7196 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007197 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007198 if (ret < 0) {
7199 Py_XDECREF(outbytes);
7200 return NULL;
7201 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007202
Victor Stinner7581cef2011-11-03 22:32:33 +01007203 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007204 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007205 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007206
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 return outbytes;
7208}
7209
7210PyObject *
7211PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7212 Py_ssize_t size,
7213 const char *errors)
7214{
Victor Stinner7581cef2011-11-03 22:32:33 +01007215 PyObject *unicode, *res;
7216 unicode = PyUnicode_FromUnicode(p, size);
7217 if (unicode == NULL)
7218 return NULL;
7219 res = encode_code_page(CP_ACP, unicode, errors);
7220 Py_DECREF(unicode);
7221 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007222}
7223
7224PyObject *
7225PyUnicode_EncodeCodePage(int code_page,
7226 PyObject *unicode,
7227 const char *errors)
7228{
Victor Stinner7581cef2011-11-03 22:32:33 +01007229 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007230}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007231
Alexander Belopolsky40018472011-02-26 01:02:56 +00007232PyObject *
7233PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007234{
7235 if (!PyUnicode_Check(unicode)) {
7236 PyErr_BadArgument();
7237 return NULL;
7238 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007239 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007240}
7241
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007242#undef NEED_RETRY
7243
Victor Stinner99b95382011-07-04 14:23:54 +02007244#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007245
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246/* --- Character Mapping Codec -------------------------------------------- */
7247
Alexander Belopolsky40018472011-02-26 01:02:56 +00007248PyObject *
7249PyUnicode_DecodeCharmap(const char *s,
7250 Py_ssize_t size,
7251 PyObject *mapping,
7252 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007254 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007255 Py_ssize_t startinpos;
7256 Py_ssize_t endinpos;
7257 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007258 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007259 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007260 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007261 PyObject *errorHandler = NULL;
7262 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007263
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 /* Default to Latin-1 */
7265 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007268 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007272 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007273 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007274 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007275 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007276 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007277 enum PyUnicode_Kind mapkind;
7278 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007279 Py_UCS4 x;
7280
Benjamin Petersonbac79492012-01-14 13:34:47 -05007281 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007282 return NULL;
7283
7284 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007285 mapdata = PyUnicode_DATA(mapping);
7286 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007288 unsigned char ch;
7289 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7290 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7291 if (outkind == PyUnicode_1BYTE_KIND) {
7292 void *outdata = PyUnicode_DATA(v);
7293 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7294 while (s < e) {
7295 unsigned char ch = *s;
7296 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7297 if (x > maxchar)
7298 goto Error;
7299 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7300 ++s;
7301 }
7302 break;
7303 }
7304 else if (outkind == PyUnicode_2BYTE_KIND) {
7305 void *outdata = PyUnicode_DATA(v);
7306 while (s < e) {
7307 unsigned char ch = *s;
7308 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7309 if (x == 0xFFFE)
7310 goto Error;
7311 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7312 ++s;
7313 }
7314 break;
7315 }
7316 }
7317 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007320 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007321 else
7322 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007323Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007324 if (x == 0xfffe)
7325 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007326 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 startinpos = s-starts;
7328 endinpos = startinpos+1;
7329 if (unicode_decode_call_errorhandler(
7330 errors, &errorHandler,
7331 "charmap", "character maps to <undefined>",
7332 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007333 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 goto onError;
7335 }
7336 continue;
7337 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007338
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007339 if (unicode_putchar(&v, &outpos, x) < 0)
7340 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007342 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007343 }
7344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 while (s < e) {
7346 unsigned char ch = *s;
7347 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007348
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7350 w = PyLong_FromLong((long)ch);
7351 if (w == NULL)
7352 goto onError;
7353 x = PyObject_GetItem(mapping, w);
7354 Py_DECREF(w);
7355 if (x == NULL) {
7356 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7357 /* No mapping found means: mapping is undefined. */
7358 PyErr_Clear();
7359 x = Py_None;
7360 Py_INCREF(x);
7361 } else
7362 goto onError;
7363 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007364
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 /* Apply mapping */
7366 if (PyLong_Check(x)) {
7367 long value = PyLong_AS_LONG(x);
Antoine Pitroua1f76552012-09-23 20:00:04 +02007368 if (value < 0 || value > MAX_UNICODE) {
7369 PyErr_Format(PyExc_TypeError,
7370 "character mapping must be in range(0x%lx)",
7371 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007372 Py_DECREF(x);
7373 goto onError;
7374 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007375 if (unicode_putchar(&v, &outpos, value) < 0)
7376 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 }
7378 else if (x == Py_None) {
7379 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 startinpos = s-starts;
7381 endinpos = startinpos+1;
7382 if (unicode_decode_call_errorhandler(
7383 errors, &errorHandler,
7384 "charmap", "character maps to <undefined>",
7385 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007386 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 Py_DECREF(x);
7388 goto onError;
7389 }
7390 Py_DECREF(x);
7391 continue;
7392 }
7393 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007394 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007395
Benjamin Petersonbac79492012-01-14 13:34:47 -05007396 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007397 goto onError;
7398 targetsize = PyUnicode_GET_LENGTH(x);
7399
7400 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007402 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007403 PyUnicode_READ_CHAR(x, 0)) < 0)
7404 goto onError;
7405 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 else if (targetsize > 1) {
7407 /* 1-n mapping */
7408 if (targetsize > extrachars) {
7409 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 Py_ssize_t needed = (targetsize - extrachars) + \
7411 (targetsize << 2);
7412 extrachars += needed;
7413 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007414 if (unicode_resize(&v,
7415 PyUnicode_GET_LENGTH(v) + needed) < 0)
7416 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 Py_DECREF(x);
7418 goto onError;
7419 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007421 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007422 goto onError;
7423 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7424 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 extrachars -= targetsize;
7426 }
7427 /* 1-0 mapping: skip the character */
7428 }
7429 else {
7430 /* wrong return value */
7431 PyErr_SetString(PyExc_TypeError,
7432 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007433 Py_DECREF(x);
7434 goto onError;
7435 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 Py_DECREF(x);
7437 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007440 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007441 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007442 Py_XDECREF(errorHandler);
7443 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007444 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007445
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007447 Py_XDECREF(errorHandler);
7448 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 Py_XDECREF(v);
7450 return NULL;
7451}
7452
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007453/* Charmap encoding: the lookup table */
7454
Alexander Belopolsky40018472011-02-26 01:02:56 +00007455struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 PyObject_HEAD
7457 unsigned char level1[32];
7458 int count2, count3;
7459 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007460};
7461
7462static PyObject*
7463encoding_map_size(PyObject *obj, PyObject* args)
7464{
7465 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007466 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007468}
7469
7470static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007471 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 PyDoc_STR("Return the size (in bytes) of this object") },
7473 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007474};
7475
7476static void
7477encoding_map_dealloc(PyObject* o)
7478{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007479 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007480}
7481
7482static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007483 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 "EncodingMap", /*tp_name*/
7485 sizeof(struct encoding_map), /*tp_basicsize*/
7486 0, /*tp_itemsize*/
7487 /* methods */
7488 encoding_map_dealloc, /*tp_dealloc*/
7489 0, /*tp_print*/
7490 0, /*tp_getattr*/
7491 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007492 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 0, /*tp_repr*/
7494 0, /*tp_as_number*/
7495 0, /*tp_as_sequence*/
7496 0, /*tp_as_mapping*/
7497 0, /*tp_hash*/
7498 0, /*tp_call*/
7499 0, /*tp_str*/
7500 0, /*tp_getattro*/
7501 0, /*tp_setattro*/
7502 0, /*tp_as_buffer*/
7503 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7504 0, /*tp_doc*/
7505 0, /*tp_traverse*/
7506 0, /*tp_clear*/
7507 0, /*tp_richcompare*/
7508 0, /*tp_weaklistoffset*/
7509 0, /*tp_iter*/
7510 0, /*tp_iternext*/
7511 encoding_map_methods, /*tp_methods*/
7512 0, /*tp_members*/
7513 0, /*tp_getset*/
7514 0, /*tp_base*/
7515 0, /*tp_dict*/
7516 0, /*tp_descr_get*/
7517 0, /*tp_descr_set*/
7518 0, /*tp_dictoffset*/
7519 0, /*tp_init*/
7520 0, /*tp_alloc*/
7521 0, /*tp_new*/
7522 0, /*tp_free*/
7523 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007524};
7525
7526PyObject*
7527PyUnicode_BuildEncodingMap(PyObject* string)
7528{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007529 PyObject *result;
7530 struct encoding_map *mresult;
7531 int i;
7532 int need_dict = 0;
7533 unsigned char level1[32];
7534 unsigned char level2[512];
7535 unsigned char *mlevel1, *mlevel2, *mlevel3;
7536 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007537 int kind;
7538 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007539 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007540 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007541
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007542 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007543 PyErr_BadArgument();
7544 return NULL;
7545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007546 kind = PyUnicode_KIND(string);
7547 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007548 length = PyUnicode_GET_LENGTH(string);
7549 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007550 memset(level1, 0xFF, sizeof level1);
7551 memset(level2, 0xFF, sizeof level2);
7552
7553 /* If there isn't a one-to-one mapping of NULL to \0,
7554 or if there are non-BMP characters, we need to use
7555 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007556 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007557 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007558 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007559 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007560 ch = PyUnicode_READ(kind, data, i);
7561 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007562 need_dict = 1;
7563 break;
7564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007565 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007566 /* unmapped character */
7567 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007568 l1 = ch >> 11;
7569 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007570 if (level1[l1] == 0xFF)
7571 level1[l1] = count2++;
7572 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007573 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007574 }
7575
7576 if (count2 >= 0xFF || count3 >= 0xFF)
7577 need_dict = 1;
7578
7579 if (need_dict) {
7580 PyObject *result = PyDict_New();
7581 PyObject *key, *value;
7582 if (!result)
7583 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007584 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007585 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007586 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007587 if (!key || !value)
7588 goto failed1;
7589 if (PyDict_SetItem(result, key, value) == -1)
7590 goto failed1;
7591 Py_DECREF(key);
7592 Py_DECREF(value);
7593 }
7594 return result;
7595 failed1:
7596 Py_XDECREF(key);
7597 Py_XDECREF(value);
7598 Py_DECREF(result);
7599 return NULL;
7600 }
7601
7602 /* Create a three-level trie */
7603 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7604 16*count2 + 128*count3 - 1);
7605 if (!result)
7606 return PyErr_NoMemory();
7607 PyObject_Init(result, &EncodingMapType);
7608 mresult = (struct encoding_map*)result;
7609 mresult->count2 = count2;
7610 mresult->count3 = count3;
7611 mlevel1 = mresult->level1;
7612 mlevel2 = mresult->level23;
7613 mlevel3 = mresult->level23 + 16*count2;
7614 memcpy(mlevel1, level1, 32);
7615 memset(mlevel2, 0xFF, 16*count2);
7616 memset(mlevel3, 0, 128*count3);
7617 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007618 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007619 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007620 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7621 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007622 /* unmapped character */
7623 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007624 o1 = ch>>11;
7625 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007626 i2 = 16*mlevel1[o1] + o2;
7627 if (mlevel2[i2] == 0xFF)
7628 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007629 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007630 i3 = 128*mlevel2[i2] + o3;
7631 mlevel3[i3] = i;
7632 }
7633 return result;
7634}
7635
7636static int
Victor Stinner22168992011-11-20 17:09:18 +01007637encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007638{
7639 struct encoding_map *map = (struct encoding_map*)mapping;
7640 int l1 = c>>11;
7641 int l2 = (c>>7) & 0xF;
7642 int l3 = c & 0x7F;
7643 int i;
7644
Victor Stinner22168992011-11-20 17:09:18 +01007645 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007647 if (c == 0)
7648 return 0;
7649 /* level 1*/
7650 i = map->level1[l1];
7651 if (i == 0xFF) {
7652 return -1;
7653 }
7654 /* level 2*/
7655 i = map->level23[16*i+l2];
7656 if (i == 0xFF) {
7657 return -1;
7658 }
7659 /* level 3 */
7660 i = map->level23[16*map->count2 + 128*i + l3];
7661 if (i == 0) {
7662 return -1;
7663 }
7664 return i;
7665}
7666
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007667/* Lookup the character ch in the mapping. If the character
7668 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007669 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007670static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007671charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672{
Christian Heimes217cfd12007-12-02 14:31:20 +00007673 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007674 PyObject *x;
7675
7676 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007678 x = PyObject_GetItem(mapping, w);
7679 Py_DECREF(w);
7680 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7682 /* No mapping found means: mapping is undefined. */
7683 PyErr_Clear();
7684 x = Py_None;
7685 Py_INCREF(x);
7686 return x;
7687 } else
7688 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007690 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007691 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007692 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 long value = PyLong_AS_LONG(x);
7694 if (value < 0 || value > 255) {
7695 PyErr_SetString(PyExc_TypeError,
7696 "character mapping must be in range(256)");
7697 Py_DECREF(x);
7698 return NULL;
7699 }
7700 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007702 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007703 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 /* wrong return value */
7706 PyErr_Format(PyExc_TypeError,
7707 "character mapping must return integer, bytes or None, not %.400s",
7708 x->ob_type->tp_name);
7709 Py_DECREF(x);
7710 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 }
7712}
7713
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007714static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007715charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007716{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007717 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7718 /* exponentially overallocate to minimize reallocations */
7719 if (requiredsize < 2*outsize)
7720 requiredsize = 2*outsize;
7721 if (_PyBytes_Resize(outobj, requiredsize))
7722 return -1;
7723 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007724}
7725
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the encoded byte(s) were written */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* an exception has been set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007729/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007730 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007731 space is available. Return a new reference to the object that
7732 was put in the output buffer, or Py_None, if the mapping was undefined
7733 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007734 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007735static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007736charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007737 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007738{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007739 PyObject *rep;
7740 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007741 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007742
Christian Heimes90aa7642007-12-19 02:45:37 +00007743 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007744 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007746 if (res == -1)
7747 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 if (outsize<requiredsize)
7749 if (charmapencode_resize(outobj, outpos, requiredsize))
7750 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007751 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 outstart[(*outpos)++] = (char)res;
7753 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007754 }
7755
7756 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007757 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007758 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007759 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 Py_DECREF(rep);
7761 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007762 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 if (PyLong_Check(rep)) {
7764 Py_ssize_t requiredsize = *outpos+1;
7765 if (outsize<requiredsize)
7766 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7767 Py_DECREF(rep);
7768 return enc_EXCEPTION;
7769 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007770 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007772 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 else {
7774 const char *repchars = PyBytes_AS_STRING(rep);
7775 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7776 Py_ssize_t requiredsize = *outpos+repsize;
7777 if (outsize<requiredsize)
7778 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7779 Py_DECREF(rep);
7780 return enc_EXCEPTION;
7781 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007782 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 memcpy(outstart + *outpos, repchars, repsize);
7784 *outpos += repsize;
7785 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007786 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007787 Py_DECREF(rep);
7788 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007789}
7790
7791/* handle an error in PyUnicode_EncodeCharmap
7792 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007793static int
7794charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007795 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007796 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007797 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007798 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007799{
7800 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007801 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007802 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007803 enum PyUnicode_Kind kind;
7804 void *data;
7805 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007806 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007807 Py_ssize_t collstartpos = *inpos;
7808 Py_ssize_t collendpos = *inpos+1;
7809 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007810 char *encoding = "charmap";
7811 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007812 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007813 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007814 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007815
Benjamin Petersonbac79492012-01-14 13:34:47 -05007816 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007817 return -1;
7818 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007819 /* find all unencodable characters */
7820 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007821 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007822 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007823 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007824 val = encoding_map_lookup(ch, mapping);
7825 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 break;
7827 ++collendpos;
7828 continue;
7829 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007830
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007831 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7832 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 if (rep==NULL)
7834 return -1;
7835 else if (rep!=Py_None) {
7836 Py_DECREF(rep);
7837 break;
7838 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007839 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007841 }
7842 /* cache callback name lookup
7843 * (if not done yet, i.e. it's the first error) */
7844 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 if ((errors==NULL) || (!strcmp(errors, "strict")))
7846 *known_errorHandler = 1;
7847 else if (!strcmp(errors, "replace"))
7848 *known_errorHandler = 2;
7849 else if (!strcmp(errors, "ignore"))
7850 *known_errorHandler = 3;
7851 else if (!strcmp(errors, "xmlcharrefreplace"))
7852 *known_errorHandler = 4;
7853 else
7854 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007855 }
7856 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007857 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007858 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859 return -1;
7860 case 2: /* replace */
7861 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 x = charmapencode_output('?', mapping, res, respos);
7863 if (x==enc_EXCEPTION) {
7864 return -1;
7865 }
7866 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007867 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 return -1;
7869 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007870 }
7871 /* fall through */
7872 case 3: /* ignore */
7873 *inpos = collendpos;
7874 break;
7875 case 4: /* xmlcharrefreplace */
7876 /* generate replacement (temporarily (mis)uses p) */
7877 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 char buffer[2+29+1+1];
7879 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007880 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 for (cp = buffer; *cp; ++cp) {
7882 x = charmapencode_output(*cp, mapping, res, respos);
7883 if (x==enc_EXCEPTION)
7884 return -1;
7885 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007886 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 return -1;
7888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007889 }
7890 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007891 *inpos = collendpos;
7892 break;
7893 default:
7894 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007895 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007897 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007899 if (PyBytes_Check(repunicode)) {
7900 /* Directly copy bytes result to output. */
7901 Py_ssize_t outsize = PyBytes_Size(*res);
7902 Py_ssize_t requiredsize;
7903 repsize = PyBytes_Size(repunicode);
7904 requiredsize = *respos + repsize;
7905 if (requiredsize > outsize)
7906 /* Make room for all additional bytes. */
7907 if (charmapencode_resize(res, respos, requiredsize)) {
7908 Py_DECREF(repunicode);
7909 return -1;
7910 }
7911 memcpy(PyBytes_AsString(*res) + *respos,
7912 PyBytes_AsString(repunicode), repsize);
7913 *respos += repsize;
7914 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007915 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007916 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007918 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007919 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007920 Py_DECREF(repunicode);
7921 return -1;
7922 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007923 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007924 data = PyUnicode_DATA(repunicode);
7925 kind = PyUnicode_KIND(repunicode);
7926 for (index = 0; index < repsize; index++) {
7927 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7928 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007930 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 return -1;
7932 }
7933 else if (x==enc_FAILED) {
7934 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007935 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 return -1;
7937 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007938 }
7939 *inpos = newpos;
7940 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007941 }
7942 return 0;
7943}
7944
Alexander Belopolsky40018472011-02-26 01:02:56 +00007945PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007946_PyUnicode_EncodeCharmap(PyObject *unicode,
7947 PyObject *mapping,
7948 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007950 /* output object */
7951 PyObject *res = NULL;
7952 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007953 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007954 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007955 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007956 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957 PyObject *errorHandler = NULL;
7958 PyObject *exc = NULL;
7959 /* the following variable is used for caching string comparisons
7960 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7961 * 3=ignore, 4=xmlcharrefreplace */
7962 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963
Benjamin Petersonbac79492012-01-14 13:34:47 -05007964 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007965 return NULL;
7966 size = PyUnicode_GET_LENGTH(unicode);
7967
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 /* Default to Latin-1 */
7969 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007970 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007972 /* allocate enough for a simple encoding without
7973 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007974 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007975 if (res == NULL)
7976 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007977 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007980 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007981 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007983 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 if (x==enc_EXCEPTION) /* error */
7985 goto onError;
7986 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007987 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 &exc,
7989 &known_errorHandler, &errorHandler, errors,
7990 &res, &respos)) {
7991 goto onError;
7992 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007993 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 else
7995 /* done with this character => adjust input position */
7996 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007999 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008000 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008001 if (_PyBytes_Resize(&res, respos) < 0)
8002 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008003
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008004 Py_XDECREF(exc);
8005 Py_XDECREF(errorHandler);
8006 return res;
8007
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008009 Py_XDECREF(res);
8010 Py_XDECREF(exc);
8011 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 return NULL;
8013}
8014
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008015/* Deprecated */
8016PyObject *
8017PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8018 Py_ssize_t size,
8019 PyObject *mapping,
8020 const char *errors)
8021{
8022 PyObject *result;
8023 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8024 if (unicode == NULL)
8025 return NULL;
8026 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8027 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008028 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008029}
8030
Alexander Belopolsky40018472011-02-26 01:02:56 +00008031PyObject *
8032PyUnicode_AsCharmapString(PyObject *unicode,
8033 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034{
8035 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 PyErr_BadArgument();
8037 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008039 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040}
8041
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008042/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008043static void
8044make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008045 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008046 Py_ssize_t startpos, Py_ssize_t endpos,
8047 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008049 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008050 *exceptionObject = _PyUnicodeTranslateError_Create(
8051 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 }
8053 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8055 goto onError;
8056 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8057 goto onError;
8058 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8059 goto onError;
8060 return;
8061 onError:
8062 Py_DECREF(*exceptionObject);
8063 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 }
8065}
8066
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008067/* error handling callback helper:
8068 build arguments, call the callback and check the arguments,
8069 put the result into newpos and return the replacement string, which
8070 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008071static PyObject *
8072unicode_translate_call_errorhandler(const char *errors,
8073 PyObject **errorHandler,
8074 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008075 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008076 Py_ssize_t startpos, Py_ssize_t endpos,
8077 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008079 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008081 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082 PyObject *restuple;
8083 PyObject *resunicode;
8084
8085 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008087 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089 }
8090
8091 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008092 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008093 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095
8096 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008101 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 Py_DECREF(restuple);
8103 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 }
8105 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 &resunicode, &i_newpos)) {
8107 Py_DECREF(restuple);
8108 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008110 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008111 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008112 else
8113 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008114 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8116 Py_DECREF(restuple);
8117 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008118 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 Py_INCREF(resunicode);
8120 Py_DECREF(restuple);
8121 return resunicode;
8122}
8123
8124/* Lookup the character ch in the mapping and put the result in result,
8125 which must be decrefed by the caller.
8126 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008127static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008128charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129{
Christian Heimes217cfd12007-12-02 14:31:20 +00008130 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 PyObject *x;
8132
8133 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008134 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008135 x = PyObject_GetItem(mapping, w);
8136 Py_DECREF(w);
8137 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8139 /* No mapping found means: use 1:1 mapping. */
8140 PyErr_Clear();
8141 *result = NULL;
8142 return 0;
8143 } else
8144 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008145 }
8146 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 *result = x;
8148 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008149 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008150 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 long value = PyLong_AS_LONG(x);
8152 long max = PyUnicode_GetMax();
8153 if (value < 0 || value > max) {
8154 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008155 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 Py_DECREF(x);
8157 return -1;
8158 }
8159 *result = x;
8160 return 0;
8161 }
8162 else if (PyUnicode_Check(x)) {
8163 *result = x;
8164 return 0;
8165 }
8166 else {
8167 /* wrong return value */
8168 PyErr_SetString(PyExc_TypeError,
8169 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 Py_DECREF(x);
8171 return -1;
8172 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008173}
8174/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 if not reallocate and adjust various state variables.
8176 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008177static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008178charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008181 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008182 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008183 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 /* exponentially overallocate to minimize reallocations */
8185 if (requiredsize < 2 * oldsize)
8186 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008187 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8188 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008190 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008191 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008192 }
8193 return 0;
8194}
8195/* lookup the character, put the result in the output string and adjust
8196 various state variables. Return a new reference to the object that
8197 was put in the output buffer in *result, or Py_None, if the mapping was
8198 undefined (in which case no character was written).
8199 The called must decref result.
8200 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008201static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008202charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8203 PyObject *mapping, Py_UCS4 **output,
8204 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008205 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008206{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008207 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8208 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008210 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008212 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213 }
8214 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008216 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008218 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008219 }
8220 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008221 Py_ssize_t repsize;
8222 if (PyUnicode_READY(*res) == -1)
8223 return -1;
8224 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 if (repsize==1) {
8226 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008227 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 }
8229 else if (repsize!=0) {
8230 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008231 Py_ssize_t requiredsize = *opos +
8232 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008234 Py_ssize_t i;
8235 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008237 for(i = 0; i < repsize; i++)
8238 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240 }
8241 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 return 0;
8244}
8245
Alexander Belopolsky40018472011-02-26 01:02:56 +00008246PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008247_PyUnicode_TranslateCharmap(PyObject *input,
8248 PyObject *mapping,
8249 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008251 /* input object */
8252 char *idata;
8253 Py_ssize_t size, i;
8254 int kind;
8255 /* output buffer */
8256 Py_UCS4 *output = NULL;
8257 Py_ssize_t osize;
8258 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008260 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008261 char *reason = "character maps to <undefined>";
8262 PyObject *errorHandler = NULL;
8263 PyObject *exc = NULL;
8264 /* the following variable is used for caching string comparisons
8265 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8266 * 3=ignore, 4=xmlcharrefreplace */
8267 int known_errorHandler = -1;
8268
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 PyErr_BadArgument();
8271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274 if (PyUnicode_READY(input) == -1)
8275 return NULL;
8276 idata = (char*)PyUnicode_DATA(input);
8277 kind = PyUnicode_KIND(input);
8278 size = PyUnicode_GET_LENGTH(input);
8279 i = 0;
8280
8281 if (size == 0) {
8282 Py_INCREF(input);
8283 return input;
8284 }
8285
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286 /* allocate enough for a simple 1:1 translation without
8287 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008288 osize = size;
8289 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8290 opos = 0;
8291 if (output == NULL) {
8292 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008294 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008296 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 /* try to encode it */
8298 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008299 if (charmaptranslate_output(input, i, mapping,
8300 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 Py_XDECREF(x);
8302 goto onError;
8303 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008304 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008306 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 else { /* untranslatable character */
8308 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8309 Py_ssize_t repsize;
8310 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008311 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313 Py_ssize_t collstart = i;
8314 Py_ssize_t collend = i+1;
8315 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008318 while (collend < size) {
8319 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 goto onError;
8321 Py_XDECREF(x);
8322 if (x!=Py_None)
8323 break;
8324 ++collend;
8325 }
8326 /* cache callback name lookup
8327 * (if not done yet, i.e. it's the first error) */
8328 if (known_errorHandler==-1) {
8329 if ((errors==NULL) || (!strcmp(errors, "strict")))
8330 known_errorHandler = 1;
8331 else if (!strcmp(errors, "replace"))
8332 known_errorHandler = 2;
8333 else if (!strcmp(errors, "ignore"))
8334 known_errorHandler = 3;
8335 else if (!strcmp(errors, "xmlcharrefreplace"))
8336 known_errorHandler = 4;
8337 else
8338 known_errorHandler = 0;
8339 }
8340 switch (known_errorHandler) {
8341 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008342 make_translate_exception(&exc,
8343 input, collstart, collend, reason);
8344 if (exc != NULL)
8345 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008346 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 case 2: /* replace */
8348 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008349 for (coll = collstart; coll<collend; coll++)
8350 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 /* fall through */
8352 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 break;
8355 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 /* generate replacement (temporarily (mis)uses i) */
8357 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 char buffer[2+29+1+1];
8359 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8361 if (charmaptranslate_makespace(&output, &osize,
8362 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 goto onError;
8364 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 break;
8369 default:
8370 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008371 reason, input, &exc,
8372 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008373 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008375 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008376 Py_DECREF(repunicode);
8377 goto onError;
8378 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 repsize = PyUnicode_GET_LENGTH(repunicode);
8381 if (charmaptranslate_makespace(&output, &osize,
8382 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 Py_DECREF(repunicode);
8384 goto onError;
8385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 for (uni2 = 0; repsize-->0; ++uni2)
8387 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8388 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008390 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008391 }
8392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8394 if (!res)
8395 goto onError;
8396 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 Py_XDECREF(exc);
8398 Py_XDECREF(errorHandler);
8399 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 Py_XDECREF(exc);
8404 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 return NULL;
8406}
8407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008408/* Deprecated. Use PyUnicode_Translate instead. */
8409PyObject *
8410PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8411 Py_ssize_t size,
8412 PyObject *mapping,
8413 const char *errors)
8414{
Christian Heimes5f520f42012-09-11 14:03:25 +02008415 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008416 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8417 if (!unicode)
8418 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008419 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8420 Py_DECREF(unicode);
8421 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422}
8423
Alexander Belopolsky40018472011-02-26 01:02:56 +00008424PyObject *
8425PyUnicode_Translate(PyObject *str,
8426 PyObject *mapping,
8427 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428{
8429 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008430
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 str = PyUnicode_FromObject(str);
8432 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008433 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 Py_DECREF(str);
8436 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437}
Tim Petersced69f82003-09-16 20:30:58 +00008438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008439static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008440fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008441{
8442 /* No need to call PyUnicode_READY(self) because this function is only
8443 called as a callback from fixup() which does it already. */
8444 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8445 const int kind = PyUnicode_KIND(self);
8446 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008447 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008448 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 Py_ssize_t i;
8450
8451 for (i = 0; i < len; ++i) {
8452 ch = PyUnicode_READ(kind, data, i);
8453 fixed = 0;
8454 if (ch > 127) {
8455 if (Py_UNICODE_ISSPACE(ch))
8456 fixed = ' ';
8457 else {
8458 const int decimal = Py_UNICODE_TODECIMAL(ch);
8459 if (decimal >= 0)
8460 fixed = '0' + decimal;
8461 }
8462 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008463 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008464 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465 PyUnicode_WRITE(kind, data, i, fixed);
8466 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008467 else
8468 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 }
8471
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008472 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473}
8474
8475PyObject *
8476_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8477{
8478 if (!PyUnicode_Check(unicode)) {
8479 PyErr_BadInternalCall();
8480 return NULL;
8481 }
8482 if (PyUnicode_READY(unicode) == -1)
8483 return NULL;
8484 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8485 /* If the string is already ASCII, just return the same string */
8486 Py_INCREF(unicode);
8487 return unicode;
8488 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008489 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490}
8491
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008492PyObject *
8493PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8494 Py_ssize_t length)
8495{
Victor Stinnerf0124502011-11-21 23:12:56 +01008496 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008497 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008498 Py_UCS4 maxchar;
8499 enum PyUnicode_Kind kind;
8500 void *data;
8501
Victor Stinner99d7ad02012-02-22 13:37:39 +01008502 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008503 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008504 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008505 if (ch > 127) {
8506 int decimal = Py_UNICODE_TODECIMAL(ch);
8507 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008508 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008509 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008510 }
8511 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008512
8513 /* Copy to a new string */
8514 decimal = PyUnicode_New(length, maxchar);
8515 if (decimal == NULL)
8516 return decimal;
8517 kind = PyUnicode_KIND(decimal);
8518 data = PyUnicode_DATA(decimal);
8519 /* Iterate over code points */
8520 for (i = 0; i < length; i++) {
8521 Py_UNICODE ch = s[i];
8522 if (ch > 127) {
8523 int decimal = Py_UNICODE_TODECIMAL(ch);
8524 if (decimal >= 0)
8525 ch = '0' + decimal;
8526 }
8527 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008529 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008530}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008531/* --- Decimal Encoder ---------------------------------------------------- */
8532
Alexander Belopolsky40018472011-02-26 01:02:56 +00008533int
8534PyUnicode_EncodeDecimal(Py_UNICODE *s,
8535 Py_ssize_t length,
8536 char *output,
8537 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008538{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008539 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008540 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008541 enum PyUnicode_Kind kind;
8542 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008543
8544 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 PyErr_BadArgument();
8546 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008547 }
8548
Victor Stinner42bf7752011-11-21 22:52:58 +01008549 unicode = PyUnicode_FromUnicode(s, length);
8550 if (unicode == NULL)
8551 return -1;
8552
Benjamin Petersonbac79492012-01-14 13:34:47 -05008553 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008554 Py_DECREF(unicode);
8555 return -1;
8556 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008557 kind = PyUnicode_KIND(unicode);
8558 data = PyUnicode_DATA(unicode);
8559
Victor Stinnerb84d7232011-11-22 01:50:07 +01008560 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008561 PyObject *exc;
8562 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008564 Py_ssize_t startpos;
8565
8566 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008567
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008569 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008570 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008572 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 decimal = Py_UNICODE_TODECIMAL(ch);
8574 if (decimal >= 0) {
8575 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008576 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 continue;
8578 }
8579 if (0 < ch && ch < 256) {
8580 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008581 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 continue;
8583 }
Victor Stinner6345be92011-11-25 20:09:01 +01008584
Victor Stinner42bf7752011-11-21 22:52:58 +01008585 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008586 exc = NULL;
8587 raise_encode_exception(&exc, "decimal", unicode,
8588 startpos, startpos+1,
8589 "invalid decimal Unicode string");
8590 Py_XDECREF(exc);
8591 Py_DECREF(unicode);
8592 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008593 }
8594 /* 0-terminate the output string */
8595 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008596 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008597 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008598}
8599
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600/* --- Helpers ------------------------------------------------------------ */
8601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008603any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604 Py_ssize_t start,
8605 Py_ssize_t end)
8606{
8607 int kind1, kind2, kind;
8608 void *buf1, *buf2;
8609 Py_ssize_t len1, len2, result;
8610
8611 kind1 = PyUnicode_KIND(s1);
8612 kind2 = PyUnicode_KIND(s2);
8613 kind = kind1 > kind2 ? kind1 : kind2;
8614 buf1 = PyUnicode_DATA(s1);
8615 buf2 = PyUnicode_DATA(s2);
8616 if (kind1 != kind)
8617 buf1 = _PyUnicode_AsKind(s1, kind);
8618 if (!buf1)
8619 return -2;
8620 if (kind2 != kind)
8621 buf2 = _PyUnicode_AsKind(s2, kind);
8622 if (!buf2) {
8623 if (kind1 != kind) PyMem_Free(buf1);
8624 return -2;
8625 }
8626 len1 = PyUnicode_GET_LENGTH(s1);
8627 len2 = PyUnicode_GET_LENGTH(s2);
8628
Victor Stinner794d5672011-10-10 03:21:36 +02008629 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008630 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008631 case PyUnicode_1BYTE_KIND:
8632 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8633 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8634 else
8635 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8636 break;
8637 case PyUnicode_2BYTE_KIND:
8638 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8639 break;
8640 case PyUnicode_4BYTE_KIND:
8641 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8642 break;
8643 default:
8644 assert(0); result = -2;
8645 }
8646 }
8647 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008648 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008649 case PyUnicode_1BYTE_KIND:
8650 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8651 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8652 else
8653 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8654 break;
8655 case PyUnicode_2BYTE_KIND:
8656 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8657 break;
8658 case PyUnicode_4BYTE_KIND:
8659 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8660 break;
8661 default:
8662 assert(0); result = -2;
8663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 }
8665
8666 if (kind1 != kind)
8667 PyMem_Free(buf1);
8668 if (kind2 != kind)
8669 PyMem_Free(buf2);
8670
8671 return result;
8672}
8673
8674Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008675_PyUnicode_InsertThousandsGrouping(
8676 PyObject *unicode, Py_ssize_t index,
8677 Py_ssize_t n_buffer,
8678 void *digits, Py_ssize_t n_digits,
8679 Py_ssize_t min_width,
8680 const char *grouping, PyObject *thousands_sep,
8681 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682{
Victor Stinner41a863c2012-02-24 00:37:51 +01008683 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008684 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008685 Py_ssize_t thousands_sep_len;
8686 Py_ssize_t len;
8687
8688 if (unicode != NULL) {
8689 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008690 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008691 }
8692 else {
8693 kind = PyUnicode_1BYTE_KIND;
8694 data = NULL;
8695 }
8696 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8697 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8698 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8699 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008700 if (thousands_sep_kind < kind) {
8701 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8702 if (!thousands_sep_data)
8703 return -1;
8704 }
8705 else {
8706 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8707 if (!data)
8708 return -1;
8709 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008710 }
8711
Benjamin Petersonead6b532011-12-20 17:23:42 -06008712 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008714 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008715 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008716 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008717 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008718 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008719 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008720 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008721 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008722 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008723 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008724 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008726 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008727 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008728 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008729 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008730 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008732 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008733 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008734 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008735 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008736 break;
8737 default:
8738 assert(0);
8739 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008741 if (unicode != NULL && thousands_sep_kind != kind) {
8742 if (thousands_sep_kind < kind)
8743 PyMem_Free(thousands_sep_data);
8744 else
8745 PyMem_Free(data);
8746 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008747 if (unicode == NULL) {
8748 *maxchar = 127;
8749 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008750 *maxchar = MAX_MAXCHAR(*maxchar,
8751 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008752 }
8753 }
8754 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755}
8756
8757
/* Helper macro to fix up start/end slice values against a length:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` has `len` added and, if still
       negative, is set to 0.
   `start` and `end` must be modifiable lvalues.  All arguments are
   parenthesized so that an arbitrary expression may be passed as `len`;
   each argument may be evaluated more than once, so it must not have side
   effects.  The expansion is a pair of plain `if` statements (not wrapped
   in do { } while (0)), so use it only as a stand-alone statement. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008772
Alexander Belopolsky40018472011-02-26 01:02:56 +00008773Py_ssize_t
8774PyUnicode_Count(PyObject *str,
8775 PyObject *substr,
8776 Py_ssize_t start,
8777 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008779 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008780 PyObject* str_obj;
8781 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 int kind1, kind2, kind;
8783 void *buf1 = NULL, *buf2 = NULL;
8784 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008785
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008786 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008787 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008789 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008790 if (!sub_obj) {
8791 Py_DECREF(str_obj);
8792 return -1;
8793 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008794 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008795 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 Py_DECREF(str_obj);
8797 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 }
Tim Petersced69f82003-09-16 20:30:58 +00008799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 kind1 = PyUnicode_KIND(str_obj);
8801 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008802 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008805 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008806 if (kind2 > kind) {
8807 Py_DECREF(sub_obj);
8808 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008809 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008810 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008811 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 if (!buf2)
8814 goto onError;
8815 len1 = PyUnicode_GET_LENGTH(str_obj);
8816 len2 = PyUnicode_GET_LENGTH(sub_obj);
8817
8818 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008819 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008821 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8822 result = asciilib_count(
8823 ((Py_UCS1*)buf1) + start, end - start,
8824 buf2, len2, PY_SSIZE_T_MAX
8825 );
8826 else
8827 result = ucs1lib_count(
8828 ((Py_UCS1*)buf1) + start, end - start,
8829 buf2, len2, PY_SSIZE_T_MAX
8830 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 break;
8832 case PyUnicode_2BYTE_KIND:
8833 result = ucs2lib_count(
8834 ((Py_UCS2*)buf1) + start, end - start,
8835 buf2, len2, PY_SSIZE_T_MAX
8836 );
8837 break;
8838 case PyUnicode_4BYTE_KIND:
8839 result = ucs4lib_count(
8840 ((Py_UCS4*)buf1) + start, end - start,
8841 buf2, len2, PY_SSIZE_T_MAX
8842 );
8843 break;
8844 default:
8845 assert(0); result = 0;
8846 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008847
8848 Py_DECREF(sub_obj);
8849 Py_DECREF(str_obj);
8850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 if (kind2 != kind)
8852 PyMem_Free(buf2);
8853
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 onError:
8856 Py_DECREF(sub_obj);
8857 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 if (kind2 != kind && buf2)
8859 PyMem_Free(buf2);
8860 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861}
8862
Alexander Belopolsky40018472011-02-26 01:02:56 +00008863Py_ssize_t
8864PyUnicode_Find(PyObject *str,
8865 PyObject *sub,
8866 Py_ssize_t start,
8867 Py_ssize_t end,
8868 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008870 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008871
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008873 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008875 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008876 if (!sub) {
8877 Py_DECREF(str);
8878 return -2;
8879 }
8880 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8881 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 Py_DECREF(str);
8883 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884 }
Tim Petersced69f82003-09-16 20:30:58 +00008885
Victor Stinner794d5672011-10-10 03:21:36 +02008886 result = any_find_slice(direction,
8887 str, sub, start, end
8888 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008889
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008891 Py_DECREF(sub);
8892
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893 return result;
8894}
8895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896Py_ssize_t
8897PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8898 Py_ssize_t start, Py_ssize_t end,
8899 int direction)
8900{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008902 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 if (PyUnicode_READY(str) == -1)
8904 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008905 if (start < 0 || end < 0) {
8906 PyErr_SetString(PyExc_IndexError, "string index out of range");
8907 return -2;
8908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909 if (end > PyUnicode_GET_LENGTH(str))
8910 end = PyUnicode_GET_LENGTH(str);
8911 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008912 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8913 kind, end-start, ch, direction);
8914 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008916 else
8917 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918}
8919
Alexander Belopolsky40018472011-02-26 01:02:56 +00008920static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008921tailmatch(PyObject *self,
8922 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008923 Py_ssize_t start,
8924 Py_ssize_t end,
8925 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 int kind_self;
8928 int kind_sub;
8929 void *data_self;
8930 void *data_sub;
8931 Py_ssize_t offset;
8932 Py_ssize_t i;
8933 Py_ssize_t end_sub;
8934
8935 if (PyUnicode_READY(self) == -1 ||
8936 PyUnicode_READY(substring) == -1)
8937 return 0;
8938
8939 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940 return 1;
8941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8943 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 kind_self = PyUnicode_KIND(self);
8948 data_self = PyUnicode_DATA(self);
8949 kind_sub = PyUnicode_KIND(substring);
8950 data_sub = PyUnicode_DATA(substring);
8951 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8952
8953 if (direction > 0)
8954 offset = end;
8955 else
8956 offset = start;
8957
8958 if (PyUnicode_READ(kind_self, data_self, offset) ==
8959 PyUnicode_READ(kind_sub, data_sub, 0) &&
8960 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8961 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8962 /* If both are of the same kind, memcmp is sufficient */
8963 if (kind_self == kind_sub) {
8964 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008965 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 data_sub,
8967 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008968 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 }
8970 /* otherwise we have to compare each character by first accesing it */
8971 else {
8972 /* We do not need to compare 0 and len(substring)-1 because
8973 the if statement above ensured already that they are equal
8974 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02008975 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 for (i = 1; i < end_sub; ++i) {
8977 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8978 PyUnicode_READ(kind_sub, data_sub, i))
8979 return 0;
8980 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008981 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983 }
8984
8985 return 0;
8986}
8987
Alexander Belopolsky40018472011-02-26 01:02:56 +00008988Py_ssize_t
8989PyUnicode_Tailmatch(PyObject *str,
8990 PyObject *substr,
8991 Py_ssize_t start,
8992 Py_ssize_t end,
8993 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008995 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008996
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997 str = PyUnicode_FromObject(str);
8998 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000 substr = PyUnicode_FromObject(substr);
9001 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009002 Py_DECREF(str);
9003 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004 }
Tim Petersced69f82003-09-16 20:30:58 +00009005
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009006 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 Py_DECREF(str);
9009 Py_DECREF(substr);
9010 return result;
9011}
9012
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013/* Apply fixfct filter to the Unicode object self and return a
9014 reference to the modified object */
9015
Alexander Belopolsky40018472011-02-26 01:02:56 +00009016static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009017fixup(PyObject *self,
9018 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009020 PyObject *u;
9021 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009022 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009024 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009026 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009027 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009029 /* fix functions return the new maximum character in a string,
9030 if the kind of the resulting unicode object does not change,
9031 everything is fine. Otherwise we need to change the string kind
9032 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009033 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009034
9035 if (maxchar_new == 0) {
9036 /* no changes */;
9037 if (PyUnicode_CheckExact(self)) {
9038 Py_DECREF(u);
9039 Py_INCREF(self);
9040 return self;
9041 }
9042 else
9043 return u;
9044 }
9045
Victor Stinnere6abb482012-05-02 01:15:40 +02009046 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047
Victor Stinnereaab6042011-12-11 22:22:39 +01009048 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009050
9051 /* In case the maximum character changed, we need to
9052 convert the string to the new category. */
9053 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9054 if (v == NULL) {
9055 Py_DECREF(u);
9056 return NULL;
9057 }
9058 if (maxchar_new > maxchar_old) {
9059 /* If the maxchar increased so that the kind changed, not all
9060 characters are representable anymore and we need to fix the
9061 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009062 _PyUnicode_FastCopyCharacters(v, 0,
9063 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009064 maxchar_old = fixfct(v);
9065 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 }
9067 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009068 _PyUnicode_FastCopyCharacters(v, 0,
9069 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009071 Py_DECREF(u);
9072 assert(_PyUnicode_CheckConsistency(v, 1));
9073 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074}
9075
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009076static PyObject *
9077ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009078{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009079 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9080 char *resdata, *data = PyUnicode_DATA(self);
9081 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009082
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009083 res = PyUnicode_New(len, 127);
9084 if (res == NULL)
9085 return NULL;
9086 resdata = PyUnicode_DATA(res);
9087 if (lower)
9088 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009090 _Py_bytes_upper(resdata, data, len);
9091 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092}
9093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009095handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009097 Py_ssize_t j;
9098 int final_sigma;
9099 Py_UCS4 c;
9100 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009101
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009102 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9103
9104 where ! is a negation and \p{xxx} is a character with property xxx.
9105 */
9106 for (j = i - 1; j >= 0; j--) {
9107 c = PyUnicode_READ(kind, data, j);
9108 if (!_PyUnicode_IsCaseIgnorable(c))
9109 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009111 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9112 if (final_sigma) {
9113 for (j = i + 1; j < length; j++) {
9114 c = PyUnicode_READ(kind, data, j);
9115 if (!_PyUnicode_IsCaseIgnorable(c))
9116 break;
9117 }
9118 final_sigma = j == length || !_PyUnicode_IsCased(c);
9119 }
9120 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121}
9122
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009123static int
9124lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9125 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009127 /* Obscure special case. */
9128 if (c == 0x3A3) {
9129 mapped[0] = handle_capital_sigma(kind, data, length, i);
9130 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009132 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133}
9134
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009135static Py_ssize_t
9136do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009138 Py_ssize_t i, k = 0;
9139 int n_res, j;
9140 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009141
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009142 c = PyUnicode_READ(kind, data, 0);
9143 n_res = _PyUnicode_ToUpperFull(c, mapped);
9144 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009145 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009146 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009148 for (i = 1; i < length; i++) {
9149 c = PyUnicode_READ(kind, data, i);
9150 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9151 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009152 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009153 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009154 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009155 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009156 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157}
9158
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009159static Py_ssize_t
9160do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9161 Py_ssize_t i, k = 0;
9162
9163 for (i = 0; i < length; i++) {
9164 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9165 int n_res, j;
9166 if (Py_UNICODE_ISUPPER(c)) {
9167 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9168 }
9169 else if (Py_UNICODE_ISLOWER(c)) {
9170 n_res = _PyUnicode_ToUpperFull(c, mapped);
9171 }
9172 else {
9173 n_res = 1;
9174 mapped[0] = c;
9175 }
9176 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009177 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009178 res[k++] = mapped[j];
9179 }
9180 }
9181 return k;
9182}
9183
9184static Py_ssize_t
9185do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9186 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009187{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009188 Py_ssize_t i, k = 0;
9189
9190 for (i = 0; i < length; i++) {
9191 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9192 int n_res, j;
9193 if (lower)
9194 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9195 else
9196 n_res = _PyUnicode_ToUpperFull(c, mapped);
9197 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009198 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009199 res[k++] = mapped[j];
9200 }
9201 }
9202 return k;
9203}
9204
9205static Py_ssize_t
9206do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9207{
9208 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9209}
9210
9211static Py_ssize_t
9212do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9213{
9214 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9215}
9216
Benjamin Petersone51757f2012-01-12 21:10:29 -05009217static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009218do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9219{
9220 Py_ssize_t i, k = 0;
9221
9222 for (i = 0; i < length; i++) {
9223 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9224 Py_UCS4 mapped[3];
9225 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9226 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009227 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009228 res[k++] = mapped[j];
9229 }
9230 }
9231 return k;
9232}
9233
9234static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009235do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9236{
9237 Py_ssize_t i, k = 0;
9238 int previous_is_cased;
9239
9240 previous_is_cased = 0;
9241 for (i = 0; i < length; i++) {
9242 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9243 Py_UCS4 mapped[3];
9244 int n_res, j;
9245
9246 if (previous_is_cased)
9247 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9248 else
9249 n_res = _PyUnicode_ToTitleFull(c, mapped);
9250
9251 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009252 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009253 res[k++] = mapped[j];
9254 }
9255
9256 previous_is_cased = _PyUnicode_IsCased(c);
9257 }
9258 return k;
9259}
9260
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009261static PyObject *
9262case_operation(PyObject *self,
9263 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9264{
9265 PyObject *res = NULL;
9266 Py_ssize_t length, newlength = 0;
9267 int kind, outkind;
9268 void *data, *outdata;
9269 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9270
Benjamin Petersoneea48462012-01-16 14:28:50 -05009271 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009272
9273 kind = PyUnicode_KIND(self);
9274 data = PyUnicode_DATA(self);
9275 length = PyUnicode_GET_LENGTH(self);
9276 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9277 if (tmp == NULL)
9278 return PyErr_NoMemory();
9279 newlength = perform(kind, data, length, tmp, &maxchar);
9280 res = PyUnicode_New(newlength, maxchar);
9281 if (res == NULL)
9282 goto leave;
9283 tmpend = tmp + newlength;
9284 outdata = PyUnicode_DATA(res);
9285 outkind = PyUnicode_KIND(res);
9286 switch (outkind) {
9287 case PyUnicode_1BYTE_KIND:
9288 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9289 break;
9290 case PyUnicode_2BYTE_KIND:
9291 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9292 break;
9293 case PyUnicode_4BYTE_KIND:
9294 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9295 break;
9296 default:
9297 assert(0);
9298 break;
9299 }
9300 leave:
9301 PyMem_FREE(tmp);
9302 return res;
9303}
9304
Tim Peters8ce9f162004-08-27 01:49:32 +00009305PyObject *
9306PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009307{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009309 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009311 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009312 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9313 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009314 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009316 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009318 int use_memcpy;
9319 unsigned char *res_data = NULL, *sep_data = NULL;
9320 PyObject *last_obj;
9321 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322
Tim Peters05eba1f2004-08-27 21:32:02 +00009323 fseq = PySequence_Fast(seq, "");
9324 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009325 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009326 }
9327
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009328 /* NOTE: the following code can't call back into Python code,
9329 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009330 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009331
Tim Peters05eba1f2004-08-27 21:32:02 +00009332 seqlen = PySequence_Fast_GET_SIZE(fseq);
9333 /* If empty sequence, return u"". */
9334 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009335 Py_DECREF(fseq);
9336 Py_INCREF(unicode_empty);
9337 res = unicode_empty;
9338 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009339 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009340
Tim Peters05eba1f2004-08-27 21:32:02 +00009341 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009342 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009343 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009344 if (seqlen == 1) {
9345 if (PyUnicode_CheckExact(items[0])) {
9346 res = items[0];
9347 Py_INCREF(res);
9348 Py_DECREF(fseq);
9349 return res;
9350 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009351 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009352 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009353 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009354 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009355 /* Set up sep and seplen */
9356 if (separator == NULL) {
9357 /* fall back to a blank space separator */
9358 sep = PyUnicode_FromOrdinal(' ');
9359 if (!sep)
9360 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009361 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009362 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009363 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009364 else {
9365 if (!PyUnicode_Check(separator)) {
9366 PyErr_Format(PyExc_TypeError,
9367 "separator: expected str instance,"
9368 " %.80s found",
9369 Py_TYPE(separator)->tp_name);
9370 goto onError;
9371 }
9372 if (PyUnicode_READY(separator))
9373 goto onError;
9374 sep = separator;
9375 seplen = PyUnicode_GET_LENGTH(separator);
9376 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9377 /* inc refcount to keep this code path symmetric with the
9378 above case of a blank separator */
9379 Py_INCREF(sep);
9380 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009381 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009382 }
9383
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009384 /* There are at least two things to join, or else we have a subclass
9385 * of str in the sequence.
9386 * Do a pre-pass to figure out the total amount of space we'll
9387 * need (sz), and see whether all argument are strings.
9388 */
9389 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009390#ifdef Py_DEBUG
9391 use_memcpy = 0;
9392#else
9393 use_memcpy = 1;
9394#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009395 for (i = 0; i < seqlen; i++) {
9396 const Py_ssize_t old_sz = sz;
9397 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009398 if (!PyUnicode_Check(item)) {
9399 PyErr_Format(PyExc_TypeError,
9400 "sequence item %zd: expected str instance,"
9401 " %.80s found",
9402 i, Py_TYPE(item)->tp_name);
9403 goto onError;
9404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405 if (PyUnicode_READY(item) == -1)
9406 goto onError;
9407 sz += PyUnicode_GET_LENGTH(item);
9408 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009409 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009410 if (i != 0)
9411 sz += seplen;
9412 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9413 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009414 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009415 goto onError;
9416 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009417 if (use_memcpy && last_obj != NULL) {
9418 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9419 use_memcpy = 0;
9420 }
9421 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009422 }
Tim Petersced69f82003-09-16 20:30:58 +00009423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009425 if (res == NULL)
9426 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009427
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009428 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009429#ifdef Py_DEBUG
9430 use_memcpy = 0;
9431#else
9432 if (use_memcpy) {
9433 res_data = PyUnicode_1BYTE_DATA(res);
9434 kind = PyUnicode_KIND(res);
9435 if (seplen != 0)
9436 sep_data = PyUnicode_1BYTE_DATA(sep);
9437 }
9438#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009440 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009441 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009442 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009443 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009444 if (use_memcpy) {
9445 Py_MEMCPY(res_data,
9446 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009447 kind * seplen);
9448 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009449 }
9450 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009451 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009452 res_offset += seplen;
9453 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009454 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009455 itemlen = PyUnicode_GET_LENGTH(item);
9456 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009457 if (use_memcpy) {
9458 Py_MEMCPY(res_data,
9459 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009460 kind * itemlen);
9461 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009462 }
9463 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009464 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009465 res_offset += itemlen;
9466 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009467 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009468 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009469 if (use_memcpy)
9470 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009471 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009472 else
9473 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009474
Tim Peters05eba1f2004-08-27 21:32:02 +00009475 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009477 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479
Benjamin Peterson29060642009-01-31 22:14:21 +00009480 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009481 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009483 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484 return NULL;
9485}
9486
/* FILL(kind, data, value, start, length)
 *
 * Store `value` into `length` consecutive code units of the character
 * buffer `data`, beginning at index `start`.  `kind` selects the width of
 * one code unit (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
 * PyUnicode_4BYTE_KIND); `start` and `length` are counted in code units,
 * not bytes.
 *
 * Each argument is evaluated exactly once, so expressions with side
 * effects are safe to pass.  No bounds or range checking is done here: the
 * caller must make sure the range lies inside the buffer and that `value`
 * fits in a code unit of the given kind (it is truncated otherwise).
 */
#define FILL(kind, data, value, start, length) \
    do { \
        const int fill_kind_ = (int)(kind); \
        void *fill_data_ = (void *)(data); \
        const Py_UCS4 fill_value_ = (Py_UCS4)(value); \
        const Py_ssize_t fill_start_ = (start); \
        const Py_ssize_t fill_length_ = (length); \
        Py_ssize_t fill_i_; \
        assert(fill_kind_ != PyUnicode_WCHAR_KIND); \
        switch (fill_kind_) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *fill_to_ = (unsigned char *)fill_data_ + fill_start_; \
            memset(fill_to_, (unsigned char)fill_value_, (size_t)fill_length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *fill_to_ = (Py_UCS2 *)fill_data_ + fill_start_; \
            for (fill_i_ = 0; fill_i_ < fill_length_; ++fill_i_) \
                fill_to_[fill_i_] = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 *fill_to_ = (Py_UCS4 *)fill_data_ + fill_start_; \
            for (fill_i_ = 0; fill_i_ < fill_length_; ++fill_i_) \
                fill_to_[fill_i_] = fill_value_; \
            break; \
        } \
        default: \
            assert(0); \
            break; \
        } \
    } while (0)
9510
Victor Stinnerd3f08822012-05-29 12:57:52 +02009511void
9512_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9513 Py_UCS4 fill_char)
9514{
9515 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9516 const void *data = PyUnicode_DATA(unicode);
9517 assert(PyUnicode_IS_READY(unicode));
9518 assert(unicode_modifiable(unicode));
9519 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9520 assert(start >= 0);
9521 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9522 FILL(kind, data, fill_char, start, length);
9523}
9524
Victor Stinner3fe55312012-01-04 00:33:50 +01009525Py_ssize_t
9526PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9527 Py_UCS4 fill_char)
9528{
9529 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009530
9531 if (!PyUnicode_Check(unicode)) {
9532 PyErr_BadInternalCall();
9533 return -1;
9534 }
9535 if (PyUnicode_READY(unicode) == -1)
9536 return -1;
9537 if (unicode_check_modifiable(unicode))
9538 return -1;
9539
Victor Stinnerd3f08822012-05-29 12:57:52 +02009540 if (start < 0) {
9541 PyErr_SetString(PyExc_IndexError, "string index out of range");
9542 return -1;
9543 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009544 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9545 PyErr_SetString(PyExc_ValueError,
9546 "fill character is bigger than "
9547 "the string maximum character");
9548 return -1;
9549 }
9550
9551 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9552 length = Py_MIN(maxlen, length);
9553 if (length <= 0)
9554 return 0;
9555
Victor Stinnerd3f08822012-05-29 12:57:52 +02009556 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009557 return length;
9558}
9559
Victor Stinner9310abb2011-10-05 00:59:23 +02009560static PyObject *
9561pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009562 Py_ssize_t left,
9563 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 PyObject *u;
9567 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009568 int kind;
9569 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570
9571 if (left < 0)
9572 left = 0;
9573 if (right < 0)
9574 right = 0;
9575
Victor Stinnerc4b49542011-12-11 22:44:26 +01009576 if (left == 0 && right == 0)
9577 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9580 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009581 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9582 return NULL;
9583 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009585 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009587 if (!u)
9588 return NULL;
9589
9590 kind = PyUnicode_KIND(u);
9591 data = PyUnicode_DATA(u);
9592 if (left)
9593 FILL(kind, data, fill, 0, left);
9594 if (right)
9595 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009596 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009597 assert(_PyUnicode_CheckConsistency(u, 1));
9598 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599}
9600
Alexander Belopolsky40018472011-02-26 01:02:56 +00009601PyObject *
9602PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605
9606 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009607 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009608 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009609 if (PyUnicode_READY(string) == -1) {
9610 Py_DECREF(string);
9611 return NULL;
9612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613
Benjamin Petersonead6b532011-12-20 17:23:42 -06009614 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009616 if (PyUnicode_IS_ASCII(string))
9617 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009618 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009619 PyUnicode_GET_LENGTH(string), keepends);
9620 else
9621 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009622 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009623 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 break;
9625 case PyUnicode_2BYTE_KIND:
9626 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009627 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 PyUnicode_GET_LENGTH(string), keepends);
9629 break;
9630 case PyUnicode_4BYTE_KIND:
9631 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009632 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 PyUnicode_GET_LENGTH(string), keepends);
9634 break;
9635 default:
9636 assert(0);
9637 list = 0;
9638 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639 Py_DECREF(string);
9640 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641}
9642
Alexander Belopolsky40018472011-02-26 01:02:56 +00009643static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009644split(PyObject *self,
9645 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009646 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 int kind1, kind2, kind;
9649 void *buf1, *buf2;
9650 Py_ssize_t len1, len2;
9651 PyObject* out;
9652
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009654 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 if (PyUnicode_READY(self) == -1)
9657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009660 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009662 if (PyUnicode_IS_ASCII(self))
9663 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009664 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009665 PyUnicode_GET_LENGTH(self), maxcount
9666 );
9667 else
9668 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009669 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009670 PyUnicode_GET_LENGTH(self), maxcount
9671 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 case PyUnicode_2BYTE_KIND:
9673 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009674 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 PyUnicode_GET_LENGTH(self), maxcount
9676 );
9677 case PyUnicode_4BYTE_KIND:
9678 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009679 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 PyUnicode_GET_LENGTH(self), maxcount
9681 );
9682 default:
9683 assert(0);
9684 return NULL;
9685 }
9686
9687 if (PyUnicode_READY(substring) == -1)
9688 return NULL;
9689
9690 kind1 = PyUnicode_KIND(self);
9691 kind2 = PyUnicode_KIND(substring);
9692 kind = kind1 > kind2 ? kind1 : kind2;
9693 buf1 = PyUnicode_DATA(self);
9694 buf2 = PyUnicode_DATA(substring);
9695 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009696 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 if (!buf1)
9698 return NULL;
9699 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009700 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 if (!buf2) {
9702 if (kind1 != kind) PyMem_Free(buf1);
9703 return NULL;
9704 }
9705 len1 = PyUnicode_GET_LENGTH(self);
9706 len2 = PyUnicode_GET_LENGTH(substring);
9707
Benjamin Petersonead6b532011-12-20 17:23:42 -06009708 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009710 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9711 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009712 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009713 else
9714 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009715 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 break;
9717 case PyUnicode_2BYTE_KIND:
9718 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009719 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 break;
9721 case PyUnicode_4BYTE_KIND:
9722 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009723 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724 break;
9725 default:
9726 out = NULL;
9727 }
9728 if (kind1 != kind)
9729 PyMem_Free(buf1);
9730 if (kind2 != kind)
9731 PyMem_Free(buf2);
9732 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009733}
9734
Alexander Belopolsky40018472011-02-26 01:02:56 +00009735static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009736rsplit(PyObject *self,
9737 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009738 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009739{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 int kind1, kind2, kind;
9741 void *buf1, *buf2;
9742 Py_ssize_t len1, len2;
9743 PyObject* out;
9744
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009745 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009746 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 if (PyUnicode_READY(self) == -1)
9749 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009752 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009753 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009754 if (PyUnicode_IS_ASCII(self))
9755 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009756 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009757 PyUnicode_GET_LENGTH(self), maxcount
9758 );
9759 else
9760 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009761 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009762 PyUnicode_GET_LENGTH(self), maxcount
9763 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 case PyUnicode_2BYTE_KIND:
9765 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009766 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 PyUnicode_GET_LENGTH(self), maxcount
9768 );
9769 case PyUnicode_4BYTE_KIND:
9770 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009771 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 PyUnicode_GET_LENGTH(self), maxcount
9773 );
9774 default:
9775 assert(0);
9776 return NULL;
9777 }
9778
9779 if (PyUnicode_READY(substring) == -1)
9780 return NULL;
9781
9782 kind1 = PyUnicode_KIND(self);
9783 kind2 = PyUnicode_KIND(substring);
9784 kind = kind1 > kind2 ? kind1 : kind2;
9785 buf1 = PyUnicode_DATA(self);
9786 buf2 = PyUnicode_DATA(substring);
9787 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009788 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 if (!buf1)
9790 return NULL;
9791 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009792 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 if (!buf2) {
9794 if (kind1 != kind) PyMem_Free(buf1);
9795 return NULL;
9796 }
9797 len1 = PyUnicode_GET_LENGTH(self);
9798 len2 = PyUnicode_GET_LENGTH(substring);
9799
Benjamin Petersonead6b532011-12-20 17:23:42 -06009800 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009802 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9803 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009804 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009805 else
9806 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009807 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 break;
9809 case PyUnicode_2BYTE_KIND:
9810 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009811 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 break;
9813 case PyUnicode_4BYTE_KIND:
9814 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009815 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 break;
9817 default:
9818 out = NULL;
9819 }
9820 if (kind1 != kind)
9821 PyMem_Free(buf1);
9822 if (kind2 != kind)
9823 PyMem_Free(buf2);
9824 return out;
9825}
9826
9827static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009828anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9829 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009831 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009833 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9834 return asciilib_find(buf1, len1, buf2, len2, offset);
9835 else
9836 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 case PyUnicode_2BYTE_KIND:
9838 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9839 case PyUnicode_4BYTE_KIND:
9840 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9841 }
9842 assert(0);
9843 return -1;
9844}
9845
9846static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009847anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9848 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009850 switch (kind) {
9851 case PyUnicode_1BYTE_KIND:
9852 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9853 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9854 else
9855 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9856 case PyUnicode_2BYTE_KIND:
9857 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9858 case PyUnicode_4BYTE_KIND:
9859 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9860 }
9861 assert(0);
9862 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009863}
9864
Alexander Belopolsky40018472011-02-26 01:02:56 +00009865static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866replace(PyObject *self, PyObject *str1,
9867 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 PyObject *u;
9870 char *sbuf = PyUnicode_DATA(self);
9871 char *buf1 = PyUnicode_DATA(str1);
9872 char *buf2 = PyUnicode_DATA(str2);
9873 int srelease = 0, release1 = 0, release2 = 0;
9874 int skind = PyUnicode_KIND(self);
9875 int kind1 = PyUnicode_KIND(str1);
9876 int kind2 = PyUnicode_KIND(str2);
9877 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9878 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9879 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009880 int mayshrink;
9881 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882
9883 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009884 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009886 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887
Victor Stinner59de0ee2011-10-07 10:01:28 +02009888 if (str1 == str2)
9889 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009890 if (skind < kind1)
9891 /* substring too wide to be present */
9892 goto nothing;
9893
Victor Stinner49a0a212011-10-12 23:46:10 +02009894 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9895 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9896 /* Replacing str1 with str2 may cause a maxchar reduction in the
9897 result string. */
9898 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +02009899 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009902 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009904 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009906 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009907 Py_UCS4 u1, u2;
9908 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +01009909 Py_ssize_t index, pos;
9910 char *src;
9911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +01009913 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9914 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009915 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009918 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +02009920 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +01009922
9923 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9924 index = 0;
9925 src = sbuf;
9926 while (--maxcount)
9927 {
9928 pos++;
9929 src += pos * PyUnicode_KIND(self);
9930 slen -= pos;
9931 index += pos;
9932 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9933 if (pos < 0)
9934 break;
9935 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9936 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009937 }
9938 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 int rkind = skind;
9940 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009941 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 if (kind1 < rkind) {
9944 /* widen substring */
9945 buf1 = _PyUnicode_AsKind(str1, rkind);
9946 if (!buf1) goto error;
9947 release1 = 1;
9948 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009949 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009950 if (i < 0)
9951 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 if (rkind > kind2) {
9953 /* widen replacement */
9954 buf2 = _PyUnicode_AsKind(str2, rkind);
9955 if (!buf2) goto error;
9956 release2 = 1;
9957 }
9958 else if (rkind < kind2) {
9959 /* widen self and buf1 */
9960 rkind = kind2;
9961 if (release1) PyMem_Free(buf1);
9962 sbuf = _PyUnicode_AsKind(self, rkind);
9963 if (!sbuf) goto error;
9964 srelease = 1;
9965 buf1 = _PyUnicode_AsKind(str1, rkind);
9966 if (!buf1) goto error;
9967 release1 = 1;
9968 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009969 u = PyUnicode_New(slen, maxchar);
9970 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009972 assert(PyUnicode_KIND(u) == rkind);
9973 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009974
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009975 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009976 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009977 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009979 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009981
9982 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009983 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009984 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009985 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009986 if (i == -1)
9987 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009988 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009990 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009992 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009994 }
9995 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01009997 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 int rkind = skind;
9999 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010002 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 buf1 = _PyUnicode_AsKind(str1, rkind);
10004 if (!buf1) goto error;
10005 release1 = 1;
10006 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010007 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010008 if (n == 0)
10009 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010011 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 buf2 = _PyUnicode_AsKind(str2, rkind);
10013 if (!buf2) goto error;
10014 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010017 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 rkind = kind2;
10019 sbuf = _PyUnicode_AsKind(self, rkind);
10020 if (!sbuf) goto error;
10021 srelease = 1;
10022 if (release1) PyMem_Free(buf1);
10023 buf1 = _PyUnicode_AsKind(str1, rkind);
10024 if (!buf1) goto error;
10025 release1 = 1;
10026 }
10027 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10028 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010029 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 PyErr_SetString(PyExc_OverflowError,
10031 "replace string is too long");
10032 goto error;
10033 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010034 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010035 if (new_size == 0) {
10036 Py_INCREF(unicode_empty);
10037 u = unicode_empty;
10038 goto done;
10039 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010040 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 PyErr_SetString(PyExc_OverflowError,
10042 "replace string is too long");
10043 goto error;
10044 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010045 u = PyUnicode_New(new_size, maxchar);
10046 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010048 assert(PyUnicode_KIND(u) == rkind);
10049 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 ires = i = 0;
10051 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010052 while (n-- > 0) {
10053 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010054 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010055 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010056 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010057 if (j == -1)
10058 break;
10059 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010060 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010061 memcpy(res + rkind * ires,
10062 sbuf + rkind * i,
10063 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010065 }
10066 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010068 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010070 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010072 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010076 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010077 memcpy(res + rkind * ires,
10078 sbuf + rkind * i,
10079 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010080 }
10081 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010082 /* interleave */
10083 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010084 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010086 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010088 if (--n <= 0)
10089 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010090 memcpy(res + rkind * ires,
10091 sbuf + rkind * i,
10092 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 ires++;
10094 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010095 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010096 memcpy(res + rkind * ires,
10097 sbuf + rkind * i,
10098 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010099 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010100 }
10101
10102 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010103 unicode_adjust_maxchar(&u);
10104 if (u == NULL)
10105 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010107
10108 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 if (srelease)
10110 PyMem_FREE(sbuf);
10111 if (release1)
10112 PyMem_FREE(buf1);
10113 if (release2)
10114 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010115 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010117
Benjamin Peterson29060642009-01-31 22:14:21 +000010118 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010119 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 if (srelease)
10121 PyMem_FREE(sbuf);
10122 if (release1)
10123 PyMem_FREE(buf1);
10124 if (release2)
10125 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010126 return unicode_result_unchanged(self);
10127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 error:
10129 if (srelease && sbuf)
10130 PyMem_FREE(sbuf);
10131 if (release1 && buf1)
10132 PyMem_FREE(buf1);
10133 if (release2 && buf2)
10134 PyMem_FREE(buf2);
10135 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136}
10137
10138/* --- Unicode Object Methods --------------------------------------------- */
10139
/* Docstring text stored under the name title__doc__.  The literal is
   continued with backslash-newline, so the continuation lines must start in
   column 0: any leading whitespace would become part of the string. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010140PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010141 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142\n\
10143Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010144characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145
10146static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010147unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010149 if (PyUnicode_READY(self) == -1)
10150 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010151 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152}
10153
/* Docstring text stored under the name capitalize__doc__.  The literal is
   continued with backslash-newline, so the continuation lines must start in
   column 0: any leading whitespace would become part of the string. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010154PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010155 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156\n\
10157Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010158have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159
10160static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010161unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010163 if (PyUnicode_READY(self) == -1)
10164 return NULL;
10165 if (PyUnicode_GET_LENGTH(self) == 0)
10166 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010167 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168}
10169
/* Docstring text stored under the name casefold__doc__.  The literal is
   continued with backslash-newline, so the continuation lines must start in
   column 0: any leading whitespace would become part of the string. */
Benjamin Petersond5890c82012-01-14 13:23:30 -050010170PyDoc_STRVAR(casefold__doc__,
10171 "S.casefold() -> str\n\
10172\n\
10173Return a version of S suitable for caseless comparisons.");
10174
10175static PyObject *
10176unicode_casefold(PyObject *self)
10177{
10178 if (PyUnicode_READY(self) == -1)
10179 return NULL;
10180 if (PyUnicode_IS_ASCII(self))
10181 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010182 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010183}
10184
10185
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010186/* Argument converter. Coerces to a single unicode character */
10187
10188static int
10189convert_uc(PyObject *obj, void *addr)
10190{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010192 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010193
Benjamin Peterson14339b62009-01-31 16:36:08 +000010194 uniobj = PyUnicode_FromObject(obj);
10195 if (uniobj == NULL) {
10196 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010197 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010198 return 0;
10199 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010201 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010202 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010203 Py_DECREF(uniobj);
10204 return 0;
10205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010207 Py_DECREF(uniobj);
10208 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010209}
10210
/* Docstring text stored under the name center__doc__.  The literal is
   continued with backslash-newline, so the continuation lines must start in
   column 0: any leading whitespace would become part of the string. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010211PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010212 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010214Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010215done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216
10217static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010218unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010220 Py_ssize_t marg, left;
10221 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 Py_UCS4 fillchar = ' ';
10223
Victor Stinnere9a29352011-10-01 02:14:59 +020010224 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226
Benjamin Petersonbac79492012-01-14 13:34:47 -050010227 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228 return NULL;
10229
Victor Stinnerc4b49542011-12-11 22:44:26 +010010230 if (PyUnicode_GET_LENGTH(self) >= width)
10231 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232
Victor Stinnerc4b49542011-12-11 22:44:26 +010010233 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234 left = marg / 2 + (marg & width & 1);
10235
Victor Stinner9310abb2011-10-05 00:59:23 +020010236 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237}
10238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239/* This function assumes that str1 and str2 are readied by the caller. */
10240
Marc-André Lemburge5034372000-08-08 08:04:29 +000010241static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010242unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010243{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 int kind1, kind2;
10245 void *data1, *data2;
Victor Stinner770e19e2012-10-04 22:59:45 +020010246 Py_ssize_t len1, len2;
10247 Py_ssize_t i, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010248
Victor Stinner90db9c42012-10-04 21:53:50 +020010249 /* a string is equal to itself */
10250 if (str1 == str2)
10251 return 0;
10252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 kind1 = PyUnicode_KIND(str1);
10254 kind2 = PyUnicode_KIND(str2);
10255 data1 = PyUnicode_DATA(str1);
10256 data2 = PyUnicode_DATA(str2);
10257 len1 = PyUnicode_GET_LENGTH(str1);
10258 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010259 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010260
Victor Stinner770e19e2012-10-04 22:59:45 +020010261 if (kind1 == 1 && kind2 == 1) {
10262 int cmp = memcmp(data1, data2, len);
10263 /* normalize result of memcmp() into the range [-1; 1] */
10264 if (cmp < 0)
10265 return -1;
10266 if (cmp > 0)
10267 return 1;
10268 }
10269 else {
10270 for (i = 0; i < len; ++i) {
10271 Py_UCS4 c1, c2;
10272 c1 = PyUnicode_READ(kind1, data1, i);
10273 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010274
Victor Stinner770e19e2012-10-04 22:59:45 +020010275 if (c1 != c2)
10276 return (c1 < c2) ? -1 : 1;
10277 }
Marc-André Lemburge5034372000-08-08 08:04:29 +000010278 }
10279
Victor Stinner770e19e2012-10-04 22:59:45 +020010280 if (len1 == len2)
10281 return 0;
10282 if (len1 < len2)
10283 return -1;
10284 else
10285 return 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010286}
10287
Victor Stinnere5567ad2012-10-23 02:48:49 +020010288static int
10289unicode_compare_eq(PyObject *str1, PyObject *str2)
10290{
10291 int kind;
10292 void *data1, *data2;
10293 Py_ssize_t len;
10294 int cmp;
10295
10296 /* a string is equal to itself */
10297 if (str1 == str2)
10298 return 1;
10299
10300 len = PyUnicode_GET_LENGTH(str1);
10301 if (PyUnicode_GET_LENGTH(str2) != len)
10302 return 0;
10303 kind = PyUnicode_KIND(str1);
10304 if (PyUnicode_KIND(str2) != kind)
10305 return 0;
10306 data1 = PyUnicode_DATA(str1);
10307 data2 = PyUnicode_DATA(str2);
10308
10309 cmp = memcmp(data1, data2, len * kind);
10310 return (cmp == 0);
10311}
10312
10313
Alexander Belopolsky40018472011-02-26 01:02:56 +000010314int
10315PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10318 if (PyUnicode_READY(left) == -1 ||
10319 PyUnicode_READY(right) == -1)
10320 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010321 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010323 PyErr_Format(PyExc_TypeError,
10324 "Can't compare %.100s and %.100s",
10325 left->ob_type->tp_name,
10326 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327 return -1;
10328}
10329
Martin v. Löwis5b222132007-06-10 09:51:05 +000010330int
10331PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10332{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 Py_ssize_t i;
10334 int kind;
10335 void *data;
10336 Py_UCS4 chr;
10337
Victor Stinner910337b2011-10-03 03:20:16 +020010338 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 if (PyUnicode_READY(uni) == -1)
10340 return -1;
10341 kind = PyUnicode_KIND(uni);
10342 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010343 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10345 if (chr != str[i])
10346 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010347 /* This check keeps Python strings that end in '\0' from comparing equal
10348 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010350 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010351 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010352 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010353 return 0;
10354}
10355
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010356
Benjamin Peterson29060642009-01-31 22:14:21 +000010357#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010358 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010359
Alexander Belopolsky40018472011-02-26 01:02:56 +000010360PyObject *
10361PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010362{
10363 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010364 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010365
Victor Stinnere5567ad2012-10-23 02:48:49 +020010366 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10367 Py_RETURN_NOTIMPLEMENTED;
10368
10369 if (PyUnicode_READY(left) == -1 ||
10370 PyUnicode_READY(right) == -1)
10371 return NULL;
10372
10373 if (op == Py_EQ || op == Py_NE) {
10374 result = unicode_compare_eq(left, right);
10375 if (op == Py_EQ)
10376 v = TEST_COND(result);
10377 else
10378 v = TEST_COND(!result);
10379 }
10380 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010381 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010382
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010383 /* Convert the return value to a Boolean */
10384 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010385 case Py_LE:
10386 v = TEST_COND(result <= 0);
10387 break;
10388 case Py_GE:
10389 v = TEST_COND(result >= 0);
10390 break;
10391 case Py_LT:
10392 v = TEST_COND(result == -1);
10393 break;
10394 case Py_GT:
10395 v = TEST_COND(result == 1);
10396 break;
10397 default:
10398 PyErr_BadArgument();
10399 return NULL;
10400 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010401 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010402 Py_INCREF(v);
10403 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010404}
10405
Alexander Belopolsky40018472011-02-26 01:02:56 +000010406int
10407PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010408{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010409 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 int kind1, kind2, kind;
10411 void *buf1, *buf2;
10412 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010413 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010414
10415 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010416 sub = PyUnicode_FromObject(element);
10417 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010418 PyErr_Format(PyExc_TypeError,
10419 "'in <string>' requires string as left operand, not %s",
10420 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010421 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010422 }
10423
Thomas Wouters477c8d52006-05-27 19:21:47 +000010424 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010425 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010426 Py_DECREF(sub);
10427 return -1;
10428 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010429 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10430 Py_DECREF(sub);
10431 Py_DECREF(str);
10432 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 kind1 = PyUnicode_KIND(str);
10435 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010436 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 buf1 = PyUnicode_DATA(str);
10438 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010439 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010440 if (kind2 > kind) {
10441 Py_DECREF(sub);
10442 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010443 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010444 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010445 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 if (!buf2) {
10448 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010449 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 return -1;
10451 }
10452 len1 = PyUnicode_GET_LENGTH(str);
10453 len2 = PyUnicode_GET_LENGTH(sub);
10454
Benjamin Petersonead6b532011-12-20 17:23:42 -060010455 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 case PyUnicode_1BYTE_KIND:
10457 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10458 break;
10459 case PyUnicode_2BYTE_KIND:
10460 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10461 break;
10462 case PyUnicode_4BYTE_KIND:
10463 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10464 break;
10465 default:
10466 result = -1;
10467 assert(0);
10468 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010469
10470 Py_DECREF(str);
10471 Py_DECREF(sub);
10472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 if (kind2 != kind)
10474 PyMem_Free(buf2);
10475
Guido van Rossum403d68b2000-03-13 15:55:09 +000010476 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010477}
10478
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479/* Concat to string or Unicode object giving a new Unicode object. */
10480
Alexander Belopolsky40018472011-02-26 01:02:56 +000010481PyObject *
10482PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010485 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010486 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487
10488 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010491 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010494 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495
10496 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010497 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010498 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010501 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010502 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010504 }
10505
Victor Stinner488fa492011-12-12 00:01:39 +010010506 u_len = PyUnicode_GET_LENGTH(u);
10507 v_len = PyUnicode_GET_LENGTH(v);
10508 if (u_len > PY_SSIZE_T_MAX - v_len) {
10509 PyErr_SetString(PyExc_OverflowError,
10510 "strings are too large to concat");
10511 goto onError;
10512 }
10513 new_len = u_len + v_len;
10514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010516 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010517 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010520 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010522 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010523 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10524 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525 Py_DECREF(u);
10526 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010527 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529
Benjamin Peterson29060642009-01-31 22:14:21 +000010530 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531 Py_XDECREF(u);
10532 Py_XDECREF(v);
10533 return NULL;
10534}
10535
Walter Dörwald1ab83302007-05-18 17:15:44 +000010536void
Victor Stinner23e56682011-10-03 03:54:37 +020010537PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010538{
Victor Stinner23e56682011-10-03 03:54:37 +020010539 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010540 Py_UCS4 maxchar, maxchar2;
10541 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010542
10543 if (p_left == NULL) {
10544 if (!PyErr_Occurred())
10545 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010546 return;
10547 }
Victor Stinner23e56682011-10-03 03:54:37 +020010548 left = *p_left;
10549 if (right == NULL || !PyUnicode_Check(left)) {
10550 if (!PyErr_Occurred())
10551 PyErr_BadInternalCall();
10552 goto error;
10553 }
10554
Benjamin Petersonbac79492012-01-14 13:34:47 -050010555 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010556 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010557 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010558 goto error;
10559
Victor Stinner488fa492011-12-12 00:01:39 +010010560 /* Shortcuts */
10561 if (left == unicode_empty) {
10562 Py_DECREF(left);
10563 Py_INCREF(right);
10564 *p_left = right;
10565 return;
10566 }
10567 if (right == unicode_empty)
10568 return;
10569
10570 left_len = PyUnicode_GET_LENGTH(left);
10571 right_len = PyUnicode_GET_LENGTH(right);
10572 if (left_len > PY_SSIZE_T_MAX - right_len) {
10573 PyErr_SetString(PyExc_OverflowError,
10574 "strings are too large to concat");
10575 goto error;
10576 }
10577 new_len = left_len + right_len;
10578
10579 if (unicode_modifiable(left)
10580 && PyUnicode_CheckExact(right)
10581 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010582 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10583 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010584 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010585 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010586 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10587 {
10588 /* append inplace */
10589 if (unicode_resize(p_left, new_len) != 0) {
10590 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10591 * deallocated so it cannot be put back into
10592 * 'variable'. The MemoryError is raised when there
10593 * is no value in 'variable', which might (very
10594 * remotely) be a cause of incompatibilities.
10595 */
10596 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010597 }
Victor Stinner488fa492011-12-12 00:01:39 +010010598 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010599 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010600 }
Victor Stinner488fa492011-12-12 00:01:39 +010010601 else {
10602 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10603 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010604 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010605
Victor Stinner488fa492011-12-12 00:01:39 +010010606 /* Concat the two Unicode strings */
10607 res = PyUnicode_New(new_len, maxchar);
10608 if (res == NULL)
10609 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010610 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10611 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010612 Py_DECREF(left);
10613 *p_left = res;
10614 }
10615 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010616 return;
10617
10618error:
Victor Stinner488fa492011-12-12 00:01:39 +010010619 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010620}
10621
10622void
10623PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10624{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010625 PyUnicode_Append(pleft, right);
10626 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010627}
10628
/* Docstring text stored under the name count__doc__.  The literal is
   continued with backslash-newline, so the continuation lines must start in
   column 0: any leading whitespace would become part of the string. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010629PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010630 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010631\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010632Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010633string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010634interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635
10636static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010637unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010639 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010640 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010641 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 int kind1, kind2, kind;
10644 void *buf1, *buf2;
10645 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646
Jesus Ceaac451502011-04-20 17:09:23 +020010647 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10648 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010649 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 kind1 = PyUnicode_KIND(self);
10652 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010653 if (kind2 > kind1)
10654 return PyLong_FromLong(0);
10655 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 buf1 = PyUnicode_DATA(self);
10657 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010659 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 if (!buf2) {
10661 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 return NULL;
10663 }
10664 len1 = PyUnicode_GET_LENGTH(self);
10665 len2 = PyUnicode_GET_LENGTH(substring);
10666
10667 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010668 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 case PyUnicode_1BYTE_KIND:
10670 iresult = ucs1lib_count(
10671 ((Py_UCS1*)buf1) + start, end - start,
10672 buf2, len2, PY_SSIZE_T_MAX
10673 );
10674 break;
10675 case PyUnicode_2BYTE_KIND:
10676 iresult = ucs2lib_count(
10677 ((Py_UCS2*)buf1) + start, end - start,
10678 buf2, len2, PY_SSIZE_T_MAX
10679 );
10680 break;
10681 case PyUnicode_4BYTE_KIND:
10682 iresult = ucs4lib_count(
10683 ((Py_UCS4*)buf1) + start, end - start,
10684 buf2, len2, PY_SSIZE_T_MAX
10685 );
10686 break;
10687 default:
10688 assert(0); iresult = 0;
10689 }
10690
10691 result = PyLong_FromSsize_t(iresult);
10692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 if (kind2 != kind)
10694 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695
10696 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698 return result;
10699}
10700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010701PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010702 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010704Encode S using the codec registered for encoding. Default encoding\n\
10705is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010706handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010707a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10708'xmlcharrefreplace' as well as any other name registered with\n\
10709codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710
10711static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010712unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010714 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715 char *encoding = NULL;
10716 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010717
Benjamin Peterson308d6372009-09-18 21:42:35 +000010718 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10719 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010721 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010722}
10723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010724PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726\n\
10727Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010728If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729
10730static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010731unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010733 Py_ssize_t i, j, line_pos, src_len, incr;
10734 Py_UCS4 ch;
10735 PyObject *u;
10736 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010738 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010739 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740
10741 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743
Antoine Pitrou22425222011-10-04 19:10:51 +020010744 if (PyUnicode_READY(self) == -1)
10745 return NULL;
10746
Thomas Wouters7e474022000-07-16 12:04:32 +000010747 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010748 src_len = PyUnicode_GET_LENGTH(self);
10749 i = j = line_pos = 0;
10750 kind = PyUnicode_KIND(self);
10751 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010752 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010753 for (; i < src_len; i++) {
10754 ch = PyUnicode_READ(kind, src_data, i);
10755 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010756 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010757 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010758 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010759 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010760 goto overflow;
10761 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010762 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010763 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010766 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010767 goto overflow;
10768 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010770 if (ch == '\n' || ch == '\r')
10771 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010773 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010774 if (!found)
10775 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010776
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010778 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779 if (!u)
10780 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010781 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782
Antoine Pitroue71d5742011-10-04 15:55:09 +020010783 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784
Antoine Pitroue71d5742011-10-04 15:55:09 +020010785 for (; i < src_len; i++) {
10786 ch = PyUnicode_READ(kind, src_data, i);
10787 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010789 incr = tabsize - (line_pos % tabsize);
10790 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010791 FILL(kind, dest_data, ' ', j, incr);
10792 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010793 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010794 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010795 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010796 line_pos++;
10797 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010798 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010799 if (ch == '\n' || ch == '\r')
10800 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010802 }
10803 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010804 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010805
Antoine Pitroue71d5742011-10-04 15:55:09 +020010806 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010807 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809}
10810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010811PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010812 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813\n\
10814Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010815such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816arguments start and end are interpreted as in slice notation.\n\
10817\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010818Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819
10820static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010823 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010824 Py_ssize_t start;
10825 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010826 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827
Jesus Ceaac451502011-04-20 17:09:23 +020010828 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10829 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 if (PyUnicode_READY(self) == -1)
10833 return NULL;
10834 if (PyUnicode_READY(substring) == -1)
10835 return NULL;
10836
Victor Stinner7931d9a2011-11-04 00:22:48 +010010837 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838
10839 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (result == -2)
10842 return NULL;
10843
Christian Heimes217cfd12007-12-02 14:31:20 +000010844 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845}
10846
10847static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010848unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010850 void *data;
10851 enum PyUnicode_Kind kind;
10852 Py_UCS4 ch;
10853 PyObject *res;
10854
10855 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10856 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010858 }
10859 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10860 PyErr_SetString(PyExc_IndexError, "string index out of range");
10861 return NULL;
10862 }
10863 kind = PyUnicode_KIND(self);
10864 data = PyUnicode_DATA(self);
10865 ch = PyUnicode_READ(kind, data, index);
10866 if (ch < 256)
10867 return get_latin1_char(ch);
10868
10869 res = PyUnicode_New(1, ch);
10870 if (res == NULL)
10871 return NULL;
10872 kind = PyUnicode_KIND(res);
10873 data = PyUnicode_DATA(res);
10874 PyUnicode_WRITE(kind, data, 0, ch);
10875 assert(_PyUnicode_CheckConsistency(res, 1));
10876 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877}
10878
Guido van Rossumc2504932007-09-18 19:42:40 +000010879/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010880 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010881static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010882unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883{
Guido van Rossumc2504932007-09-18 19:42:40 +000010884 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010885 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010886
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010887#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050010888 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010889#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010890 if (_PyUnicode_HASH(self) != -1)
10891 return _PyUnicode_HASH(self);
10892 if (PyUnicode_READY(self) == -1)
10893 return -1;
10894 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010010895 /*
10896 We make the hash of the empty string be 0, rather than using
10897 (prefix ^ suffix), since this slightly obfuscates the hash secret
10898 */
10899 if (len == 0) {
10900 _PyUnicode_HASH(self) = 0;
10901 return 0;
10902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903
10904 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010010905#define HASH(P) \
10906 x ^= (Py_uhash_t) *P << 7; \
10907 while (--len >= 0) \
10908 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909
Georg Brandl2fb477c2012-02-21 00:33:36 +010010910 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010911 switch (PyUnicode_KIND(self)) {
10912 case PyUnicode_1BYTE_KIND: {
10913 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10914 HASH(c);
10915 break;
10916 }
10917 case PyUnicode_2BYTE_KIND: {
10918 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10919 HASH(s);
10920 break;
10921 }
10922 default: {
10923 Py_UCS4 *l;
10924 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10925 "Impossible switch case in unicode_hash");
10926 l = PyUnicode_4BYTE_DATA(self);
10927 HASH(l);
10928 break;
10929 }
10930 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010010931 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
10932 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933
Guido van Rossumc2504932007-09-18 19:42:40 +000010934 if (x == -1)
10935 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010937 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010941PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010942 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010944Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945
10946static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010949 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010950 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010951 Py_ssize_t start;
10952 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953
Jesus Ceaac451502011-04-20 17:09:23 +020010954 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10955 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 if (PyUnicode_READY(self) == -1)
10959 return NULL;
10960 if (PyUnicode_READY(substring) == -1)
10961 return NULL;
10962
Victor Stinner7931d9a2011-11-04 00:22:48 +010010963 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
10965 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 if (result == -2)
10968 return NULL;
10969
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970 if (result < 0) {
10971 PyErr_SetString(PyExc_ValueError, "substring not found");
10972 return NULL;
10973 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010974
Christian Heimes217cfd12007-12-02 14:31:20 +000010975 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976}
10977
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010978PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010981Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010982at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983
10984static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010985unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 Py_ssize_t i, length;
10988 int kind;
10989 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990 int cased;
10991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 if (PyUnicode_READY(self) == -1)
10993 return NULL;
10994 length = PyUnicode_GET_LENGTH(self);
10995 kind = PyUnicode_KIND(self);
10996 data = PyUnicode_DATA(self);
10997
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 if (length == 1)
11000 return PyBool_FromLong(
11001 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011003 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011005 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011006
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 for (i = 0; i < length; i++) {
11009 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011010
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11012 return PyBool_FromLong(0);
11013 else if (!cased && Py_UNICODE_ISLOWER(ch))
11014 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011016 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017}
11018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011019PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011022Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011023at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024
11025static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011026unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 Py_ssize_t i, length;
11029 int kind;
11030 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 int cased;
11032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 if (PyUnicode_READY(self) == -1)
11034 return NULL;
11035 length = PyUnicode_GET_LENGTH(self);
11036 kind = PyUnicode_KIND(self);
11037 data = PyUnicode_DATA(self);
11038
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 if (length == 1)
11041 return PyBool_FromLong(
11042 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011044 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011046 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011047
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 for (i = 0; i < length; i++) {
11050 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011051
Benjamin Peterson29060642009-01-31 22:14:21 +000011052 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11053 return PyBool_FromLong(0);
11054 else if (!cased && Py_UNICODE_ISUPPER(ch))
11055 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011057 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058}
11059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011060PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011061 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011063Return True if S is a titlecased string and there is at least one\n\
11064character in S, i.e. upper- and titlecase characters may only\n\
11065follow uncased characters and lowercase characters only cased ones.\n\
11066Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
11068static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011069unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 Py_ssize_t i, length;
11072 int kind;
11073 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074 int cased, previous_is_cased;
11075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 if (PyUnicode_READY(self) == -1)
11077 return NULL;
11078 length = PyUnicode_GET_LENGTH(self);
11079 kind = PyUnicode_KIND(self);
11080 data = PyUnicode_DATA(self);
11081
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 if (length == 1) {
11084 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11085 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11086 (Py_UNICODE_ISUPPER(ch) != 0));
11087 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011089 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011091 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011092
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093 cased = 0;
11094 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 for (i = 0; i < length; i++) {
11096 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011097
Benjamin Peterson29060642009-01-31 22:14:21 +000011098 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11099 if (previous_is_cased)
11100 return PyBool_FromLong(0);
11101 previous_is_cased = 1;
11102 cased = 1;
11103 }
11104 else if (Py_UNICODE_ISLOWER(ch)) {
11105 if (!previous_is_cased)
11106 return PyBool_FromLong(0);
11107 previous_is_cased = 1;
11108 cased = 1;
11109 }
11110 else
11111 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011113 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114}
11115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011116PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011119Return True if all characters in S are whitespace\n\
11120and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121
11122static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011123unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011125 Py_ssize_t i, length;
11126 int kind;
11127 void *data;
11128
11129 if (PyUnicode_READY(self) == -1)
11130 return NULL;
11131 length = PyUnicode_GET_LENGTH(self);
11132 kind = PyUnicode_KIND(self);
11133 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 if (length == 1)
11137 return PyBool_FromLong(
11138 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011140 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011142 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 for (i = 0; i < length; i++) {
11145 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011146 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011147 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011149 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150}
11151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011152PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011153 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011154\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011155Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011156and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011157
11158static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011159unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011160{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161 Py_ssize_t i, length;
11162 int kind;
11163 void *data;
11164
11165 if (PyUnicode_READY(self) == -1)
11166 return NULL;
11167 length = PyUnicode_GET_LENGTH(self);
11168 kind = PyUnicode_KIND(self);
11169 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011170
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011171 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 if (length == 1)
11173 return PyBool_FromLong(
11174 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011175
11176 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011178 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 for (i = 0; i < length; i++) {
11181 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011182 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011183 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011184 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011185}
11186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011187PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011188 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011189\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011190Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011191and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011192
11193static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011194unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011195{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 int kind;
11197 void *data;
11198 Py_ssize_t len, i;
11199
11200 if (PyUnicode_READY(self) == -1)
11201 return NULL;
11202
11203 kind = PyUnicode_KIND(self);
11204 data = PyUnicode_DATA(self);
11205 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011206
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011207 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 if (len == 1) {
11209 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11210 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11211 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011212
11213 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011215 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 for (i = 0; i < len; i++) {
11218 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011219 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011220 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011221 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011222 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011223}
11224
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011225PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011226 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011228Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011229False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230
11231static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011232unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 Py_ssize_t i, length;
11235 int kind;
11236 void *data;
11237
11238 if (PyUnicode_READY(self) == -1)
11239 return NULL;
11240 length = PyUnicode_GET_LENGTH(self);
11241 kind = PyUnicode_KIND(self);
11242 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 if (length == 1)
11246 return PyBool_FromLong(
11247 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011249 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011251 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 for (i = 0; i < length; i++) {
11254 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011255 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011257 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258}
11259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011260PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011263Return True if all characters in S are digits\n\
11264and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265
11266static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011267unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011269 Py_ssize_t i, length;
11270 int kind;
11271 void *data;
11272
11273 if (PyUnicode_READY(self) == -1)
11274 return NULL;
11275 length = PyUnicode_GET_LENGTH(self);
11276 kind = PyUnicode_KIND(self);
11277 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011280 if (length == 1) {
11281 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11282 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011285 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289 for (i = 0; i < length; i++) {
11290 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011291 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011293 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294}
11295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011296PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011297 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011299Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011300False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301
11302static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011303unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305 Py_ssize_t i, length;
11306 int kind;
11307 void *data;
11308
11309 if (PyUnicode_READY(self) == -1)
11310 return NULL;
11311 length = PyUnicode_GET_LENGTH(self);
11312 kind = PyUnicode_KIND(self);
11313 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 if (length == 1)
11317 return PyBool_FromLong(
11318 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011320 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011322 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 for (i = 0; i < length; i++) {
11325 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011328 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329}
11330
Martin v. Löwis47383402007-08-15 07:32:56 +000011331int
11332PyUnicode_IsIdentifier(PyObject *self)
11333{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 int kind;
11335 void *data;
11336 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011337 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 if (PyUnicode_READY(self) == -1) {
11340 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011341 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 }
11343
11344 /* Special case for empty strings */
11345 if (PyUnicode_GET_LENGTH(self) == 0)
11346 return 0;
11347 kind = PyUnicode_KIND(self);
11348 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011349
11350 /* PEP 3131 says that the first character must be in
11351 XID_Start and subsequent characters in XID_Continue,
11352 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011353 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011354 letters, digits, underscore). However, given the current
11355 definition of XID_Start and XID_Continue, it is sufficient
11356 to check just for these, except that _ must be allowed
11357 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011359 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011360 return 0;
11361
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011362 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011364 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011365 return 1;
11366}
11367
11368PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011370\n\
11371Return True if S is a valid identifier according\n\
11372to the language definition.");
11373
11374static PyObject*
11375unicode_isidentifier(PyObject *self)
11376{
11377 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11378}
11379
Georg Brandl559e5d72008-06-11 18:37:52 +000011380PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011381 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011382\n\
11383Return True if all characters in S are considered\n\
11384printable in repr() or S is empty, False otherwise.");
11385
11386static PyObject*
11387unicode_isprintable(PyObject *self)
11388{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 Py_ssize_t i, length;
11390 int kind;
11391 void *data;
11392
11393 if (PyUnicode_READY(self) == -1)
11394 return NULL;
11395 length = PyUnicode_GET_LENGTH(self);
11396 kind = PyUnicode_KIND(self);
11397 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011398
11399 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 if (length == 1)
11401 return PyBool_FromLong(
11402 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 for (i = 0; i < length; i++) {
11405 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011406 Py_RETURN_FALSE;
11407 }
11408 }
11409 Py_RETURN_TRUE;
11410}
11411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011412PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011413 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414\n\
11415Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011416iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417
11418static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011419unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011421 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422}
11423
Martin v. Löwis18e16552006-02-15 17:27:45 +000011424static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011425unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 if (PyUnicode_READY(self) == -1)
11428 return -1;
11429 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430}
11431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011432PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011433 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011435Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011436done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437
11438static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011439unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011441 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 Py_UCS4 fillchar = ' ';
11443
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011444 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445 return NULL;
11446
Benjamin Petersonbac79492012-01-14 13:34:47 -050011447 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449
Victor Stinnerc4b49542011-12-11 22:44:26 +010011450 if (PyUnicode_GET_LENGTH(self) >= width)
11451 return unicode_result_unchanged(self);
11452
11453 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454}
11455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011456PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011457 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011459Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460
11461static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011462unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011464 if (PyUnicode_READY(self) == -1)
11465 return NULL;
11466 if (PyUnicode_IS_ASCII(self))
11467 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011468 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469}
11470
/* Selectors used by the strip family to choose which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings; the array position is the selector. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11479
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011480/* externally visible for str.strip(unicode) */
11481PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011482_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 void *data;
11485 int kind;
11486 Py_ssize_t i, j, len;
11487 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11490 return NULL;
11491
11492 kind = PyUnicode_KIND(self);
11493 data = PyUnicode_DATA(self);
11494 len = PyUnicode_GET_LENGTH(self);
11495 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11496 PyUnicode_DATA(sepobj),
11497 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011498
Benjamin Peterson14339b62009-01-31 16:36:08 +000011499 i = 0;
11500 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 while (i < len &&
11502 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 i++;
11504 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011505 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011506
Benjamin Peterson14339b62009-01-31 16:36:08 +000011507 j = len;
11508 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 do {
11510 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 } while (j >= i &&
11512 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011513 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011514 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011515
Victor Stinner7931d9a2011-11-04 00:22:48 +010011516 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517}
11518
11519PyObject*
11520PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11521{
11522 unsigned char *data;
11523 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011524 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525
Victor Stinnerde636f32011-10-01 03:55:54 +020011526 if (PyUnicode_READY(self) == -1)
11527 return NULL;
11528
Victor Stinner684d5fd2012-05-03 02:32:34 +020011529 length = PyUnicode_GET_LENGTH(self);
11530 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011531
Victor Stinner684d5fd2012-05-03 02:32:34 +020011532 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011533 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534
Victor Stinnerde636f32011-10-01 03:55:54 +020011535 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011536 PyErr_SetString(PyExc_IndexError, "string index out of range");
11537 return NULL;
11538 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011539 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011540 Py_INCREF(unicode_empty);
11541 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011542 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011543
Victor Stinner684d5fd2012-05-03 02:32:34 +020011544 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011545 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011546 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011547 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011548 }
11549 else {
11550 kind = PyUnicode_KIND(self);
11551 data = PyUnicode_1BYTE_DATA(self);
11552 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011553 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011554 length);
11555 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557
11558static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011559do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 int kind;
11562 void *data;
11563 Py_ssize_t len, i, j;
11564
11565 if (PyUnicode_READY(self) == -1)
11566 return NULL;
11567
11568 kind = PyUnicode_KIND(self);
11569 data = PyUnicode_DATA(self);
11570 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011571
Benjamin Peterson14339b62009-01-31 16:36:08 +000011572 i = 0;
11573 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011575 i++;
11576 }
11577 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011578
Benjamin Peterson14339b62009-01-31 16:36:08 +000011579 j = len;
11580 if (striptype != LEFTSTRIP) {
11581 do {
11582 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011584 j++;
11585 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011586
Victor Stinner7931d9a2011-11-04 00:22:48 +010011587 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588}
11589
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011590
11591static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011592do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011593{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011594 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011595
Benjamin Peterson14339b62009-01-31 16:36:08 +000011596 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11597 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011598
Benjamin Peterson14339b62009-01-31 16:36:08 +000011599 if (sep != NULL && sep != Py_None) {
11600 if (PyUnicode_Check(sep))
11601 return _PyUnicode_XStrip(self, striptype, sep);
11602 else {
11603 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011604 "%s arg must be None or str",
11605 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011606 return NULL;
11607 }
11608 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011609
Benjamin Peterson14339b62009-01-31 16:36:08 +000011610 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011611}
11612
11613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011614PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011615 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011616\n\
11617Return a copy of the string S with leading and trailing\n\
11618whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011619If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011620
11621static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011622unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011623{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011624 if (PyTuple_GET_SIZE(args) == 0)
11625 return do_strip(self, BOTHSTRIP); /* Common case */
11626 else
11627 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011628}
11629
11630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011631PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011632 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011633\n\
11634Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011635If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011636
11637static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011638unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011639{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011640 if (PyTuple_GET_SIZE(args) == 0)
11641 return do_strip(self, LEFTSTRIP); /* Common case */
11642 else
11643 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011644}
11645
11646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011647PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011648 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011649\n\
11650Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011651If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011652
11653static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011654unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011655{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011656 if (PyTuple_GET_SIZE(args) == 0)
11657 return do_strip(self, RIGHTSTRIP); /* Common case */
11658 else
11659 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011660}
11661
11662
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011664unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011666 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668
Georg Brandl222de0f2009-04-12 12:01:50 +000011669 if (len < 1) {
11670 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011671 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011672 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673
Victor Stinnerc4b49542011-12-11 22:44:26 +010011674 /* no repeat, return original string */
11675 if (len == 1)
11676 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011677
Benjamin Petersonbac79492012-01-14 13:34:47 -050011678 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 return NULL;
11680
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011681 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011682 PyErr_SetString(PyExc_OverflowError,
11683 "repeated string is too long");
11684 return NULL;
11685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011687
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011688 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689 if (!u)
11690 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011691 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 if (PyUnicode_GET_LENGTH(str) == 1) {
11694 const int kind = PyUnicode_KIND(str);
11695 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011696 if (kind == PyUnicode_1BYTE_KIND) {
11697 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011698 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011699 }
11700 else if (kind == PyUnicode_2BYTE_KIND) {
11701 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011702 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011703 ucs2[n] = fill_char;
11704 } else {
11705 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11706 assert(kind == PyUnicode_4BYTE_KIND);
11707 for (n = 0; n < len; ++n)
11708 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 }
11711 else {
11712 /* number of characters copied this far */
11713 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011714 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 char *to = (char *) PyUnicode_DATA(u);
11716 Py_MEMCPY(to, PyUnicode_DATA(str),
11717 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011718 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 n = (done <= nchars-done) ? done : nchars-done;
11720 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011721 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723 }
11724
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011725 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011726 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727}
11728
Alexander Belopolsky40018472011-02-26 01:02:56 +000011729PyObject *
11730PyUnicode_Replace(PyObject *obj,
11731 PyObject *subobj,
11732 PyObject *replobj,
11733 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734{
11735 PyObject *self;
11736 PyObject *str1;
11737 PyObject *str2;
11738 PyObject *result;
11739
11740 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011741 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011744 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 Py_DECREF(self);
11746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747 }
11748 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011749 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011750 Py_DECREF(self);
11751 Py_DECREF(str1);
11752 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011754 if (PyUnicode_READY(self) == -1 ||
11755 PyUnicode_READY(str1) == -1 ||
11756 PyUnicode_READY(str2) == -1)
11757 result = NULL;
11758 else
11759 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760 Py_DECREF(self);
11761 Py_DECREF(str1);
11762 Py_DECREF(str2);
11763 return result;
11764}
11765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011766PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011767 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768\n\
11769Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011770old replaced by new. If the optional argument count is\n\
11771given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772
11773static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 PyObject *str1;
11777 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011778 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779 PyObject *result;
11780
Martin v. Löwis18e16552006-02-15 17:27:45 +000011781 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011783 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011784 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011786 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 return NULL;
11788 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011789 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011790 Py_DECREF(str1);
11791 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011792 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011793 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11794 result = NULL;
11795 else
11796 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797
11798 Py_DECREF(str1);
11799 Py_DECREF(str2);
11800 return result;
11801}
11802
Alexander Belopolsky40018472011-02-26 01:02:56 +000011803static PyObject *
11804unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011806 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 Py_ssize_t isize;
11808 Py_ssize_t osize, squote, dquote, i, o;
11809 Py_UCS4 max, quote;
11810 int ikind, okind;
11811 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011814 return NULL;
11815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 isize = PyUnicode_GET_LENGTH(unicode);
11817 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 /* Compute length of output, quote characters, and
11820 maximum character */
11821 osize = 2; /* quotes */
11822 max = 127;
11823 squote = dquote = 0;
11824 ikind = PyUnicode_KIND(unicode);
11825 for (i = 0; i < isize; i++) {
11826 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11827 switch (ch) {
11828 case '\'': squote++; osize++; break;
11829 case '"': dquote++; osize++; break;
11830 case '\\': case '\t': case '\r': case '\n':
11831 osize += 2; break;
11832 default:
11833 /* Fast-path ASCII */
11834 if (ch < ' ' || ch == 0x7f)
11835 osize += 4; /* \xHH */
11836 else if (ch < 0x7f)
11837 osize++;
11838 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11839 osize++;
11840 max = ch > max ? ch : max;
11841 }
11842 else if (ch < 0x100)
11843 osize += 4; /* \xHH */
11844 else if (ch < 0x10000)
11845 osize += 6; /* \uHHHH */
11846 else
11847 osize += 10; /* \uHHHHHHHH */
11848 }
11849 }
11850
11851 quote = '\'';
11852 if (squote) {
11853 if (dquote)
11854 /* Both squote and dquote present. Use squote,
11855 and escape them */
11856 osize += squote;
11857 else
11858 quote = '"';
11859 }
11860
11861 repr = PyUnicode_New(osize, max);
11862 if (repr == NULL)
11863 return NULL;
11864 okind = PyUnicode_KIND(repr);
11865 odata = PyUnicode_DATA(repr);
11866
11867 PyUnicode_WRITE(okind, odata, 0, quote);
11868 PyUnicode_WRITE(okind, odata, osize-1, quote);
11869
11870 for (i = 0, o = 1; i < isize; i++) {
11871 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011872
11873 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 if ((ch == quote) || (ch == '\\')) {
11875 PyUnicode_WRITE(okind, odata, o++, '\\');
11876 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011877 continue;
11878 }
11879
Benjamin Peterson29060642009-01-31 22:14:21 +000011880 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011881 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 PyUnicode_WRITE(okind, odata, o++, '\\');
11883 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011884 }
11885 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 PyUnicode_WRITE(okind, odata, o++, '\\');
11887 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011888 }
11889 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 PyUnicode_WRITE(okind, odata, o++, '\\');
11891 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011892 }
11893
11894 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011895 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 PyUnicode_WRITE(okind, odata, o++, '\\');
11897 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011898 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11899 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011900 }
11901
Georg Brandl559e5d72008-06-11 18:37:52 +000011902 /* Copy ASCII characters as-is */
11903 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011905 }
11906
Benjamin Peterson29060642009-01-31 22:14:21 +000011907 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011908 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011909 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011910 (categories Z* and C* except ASCII space)
11911 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011913 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000011914 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011917 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11918 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011919 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011920 /* Map 16-bit characters to '\uxxxx' */
11921 else if (ch <= 0xffff) {
11922 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011923 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11924 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11925 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11926 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011927 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011928 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011929 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011930 PyUnicode_WRITE(okind, odata, o++, 'U');
11931 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11932 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11933 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11934 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020011935 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11936 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11937 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11938 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011939 }
11940 }
11941 /* Copy characters as-is */
11942 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011944 }
11945 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011946 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011948 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011949 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950}
11951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011952PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011953 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954\n\
11955Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011956such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957arguments start and end are interpreted as in slice notation.\n\
11958\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011959Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960
11961static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011964 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011965 Py_ssize_t start;
11966 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011967 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968
Jesus Ceaac451502011-04-20 17:09:23 +020011969 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11970 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011971 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 if (PyUnicode_READY(self) == -1)
11974 return NULL;
11975 if (PyUnicode_READY(substring) == -1)
11976 return NULL;
11977
Victor Stinner7931d9a2011-11-04 00:22:48 +010011978 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979
11980 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 if (result == -2)
11983 return NULL;
11984
Christian Heimes217cfd12007-12-02 14:31:20 +000011985 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986}
11987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011988PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011991Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992
11993static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011996 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011997 Py_ssize_t start;
11998 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011999 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000
Jesus Ceaac451502011-04-20 17:09:23 +020012001 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12002 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012003 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 if (PyUnicode_READY(self) == -1)
12006 return NULL;
12007 if (PyUnicode_READY(substring) == -1)
12008 return NULL;
12009
Victor Stinner7931d9a2011-11-04 00:22:48 +010012010 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
12012 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 if (result == -2)
12015 return NULL;
12016
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017 if (result < 0) {
12018 PyErr_SetString(PyExc_ValueError, "substring not found");
12019 return NULL;
12020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021
Christian Heimes217cfd12007-12-02 14:31:20 +000012022 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023}
12024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012025PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012026 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012028Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012029done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030
12031static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012032unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012034 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 Py_UCS4 fillchar = ' ';
12036
Victor Stinnere9a29352011-10-01 02:14:59 +020012037 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012039
Benjamin Petersonbac79492012-01-14 13:34:47 -050012040 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041 return NULL;
12042
Victor Stinnerc4b49542011-12-11 22:44:26 +010012043 if (PyUnicode_GET_LENGTH(self) >= width)
12044 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045
Victor Stinnerc4b49542011-12-11 22:44:26 +010012046 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047}
12048
Alexander Belopolsky40018472011-02-26 01:02:56 +000012049PyObject *
12050PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051{
12052 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012053
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054 s = PyUnicode_FromObject(s);
12055 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012056 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012057 if (sep != NULL) {
12058 sep = PyUnicode_FromObject(sep);
12059 if (sep == NULL) {
12060 Py_DECREF(s);
12061 return NULL;
12062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063 }
12064
Victor Stinner9310abb2011-10-05 00:59:23 +020012065 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066
12067 Py_DECREF(s);
12068 Py_XDECREF(sep);
12069 return result;
12070}
12071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012072PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012073 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074\n\
12075Return a list of the words in S, using sep as the\n\
12076delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012077splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012078whitespace string is a separator and empty strings are\n\
12079removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
12081static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012082unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012084 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012086 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012088 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12089 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090 return NULL;
12091
12092 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012093 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012095 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012097 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098}
12099
Thomas Wouters477c8d52006-05-27 19:21:47 +000012100PyObject *
12101PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12102{
12103 PyObject* str_obj;
12104 PyObject* sep_obj;
12105 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 int kind1, kind2, kind;
12107 void *buf1 = NULL, *buf2 = NULL;
12108 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012109
12110 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012111 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012113 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012114 if (!sep_obj) {
12115 Py_DECREF(str_obj);
12116 return NULL;
12117 }
12118 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12119 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012120 Py_DECREF(str_obj);
12121 return NULL;
12122 }
12123
Victor Stinner14f8f022011-10-05 20:58:25 +020012124 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012126 kind = Py_MAX(kind1, kind2);
12127 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012129 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 if (!buf1)
12131 goto onError;
12132 buf2 = PyUnicode_DATA(sep_obj);
12133 if (kind2 != kind)
12134 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12135 if (!buf2)
12136 goto onError;
12137 len1 = PyUnicode_GET_LENGTH(str_obj);
12138 len2 = PyUnicode_GET_LENGTH(sep_obj);
12139
Benjamin Petersonead6b532011-12-20 17:23:42 -060012140 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012142 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12143 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12144 else
12145 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 break;
12147 case PyUnicode_2BYTE_KIND:
12148 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12149 break;
12150 case PyUnicode_4BYTE_KIND:
12151 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12152 break;
12153 default:
12154 assert(0);
12155 out = 0;
12156 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012157
12158 Py_DECREF(sep_obj);
12159 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 if (kind1 != kind)
12161 PyMem_Free(buf1);
12162 if (kind2 != kind)
12163 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012164
12165 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 onError:
12167 Py_DECREF(sep_obj);
12168 Py_DECREF(str_obj);
12169 if (kind1 != kind && buf1)
12170 PyMem_Free(buf1);
12171 if (kind2 != kind && buf2)
12172 PyMem_Free(buf2);
12173 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012174}
12175
12176
12177PyObject *
12178PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12179{
12180 PyObject* str_obj;
12181 PyObject* sep_obj;
12182 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 int kind1, kind2, kind;
12184 void *buf1 = NULL, *buf2 = NULL;
12185 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012186
12187 str_obj = PyUnicode_FromObject(str_in);
12188 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012189 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012190 sep_obj = PyUnicode_FromObject(sep_in);
12191 if (!sep_obj) {
12192 Py_DECREF(str_obj);
12193 return NULL;
12194 }
12195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 kind1 = PyUnicode_KIND(str_in);
12197 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012198 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 buf1 = PyUnicode_DATA(str_in);
12200 if (kind1 != kind)
12201 buf1 = _PyUnicode_AsKind(str_in, kind);
12202 if (!buf1)
12203 goto onError;
12204 buf2 = PyUnicode_DATA(sep_obj);
12205 if (kind2 != kind)
12206 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12207 if (!buf2)
12208 goto onError;
12209 len1 = PyUnicode_GET_LENGTH(str_obj);
12210 len2 = PyUnicode_GET_LENGTH(sep_obj);
12211
Benjamin Petersonead6b532011-12-20 17:23:42 -060012212 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012214 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12215 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12216 else
12217 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 break;
12219 case PyUnicode_2BYTE_KIND:
12220 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12221 break;
12222 case PyUnicode_4BYTE_KIND:
12223 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12224 break;
12225 default:
12226 assert(0);
12227 out = 0;
12228 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012229
12230 Py_DECREF(sep_obj);
12231 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232 if (kind1 != kind)
12233 PyMem_Free(buf1);
12234 if (kind2 != kind)
12235 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012236
12237 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 onError:
12239 Py_DECREF(sep_obj);
12240 Py_DECREF(str_obj);
12241 if (kind1 != kind && buf1)
12242 PyMem_Free(buf1);
12243 if (kind2 != kind && buf2)
12244 PyMem_Free(buf2);
12245 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012246}
12247
12248PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012249 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012250\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012251Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012252the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012253found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012254
12255static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012256unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012257{
Victor Stinner9310abb2011-10-05 00:59:23 +020012258 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012259}
12260
12261PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012262 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012263\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012264Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012265the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012266separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012267
12268static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012269unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012270{
Victor Stinner9310abb2011-10-05 00:59:23 +020012271 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012272}
12273
Alexander Belopolsky40018472011-02-26 01:02:56 +000012274PyObject *
12275PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012276{
12277 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012278
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012279 s = PyUnicode_FromObject(s);
12280 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012281 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012282 if (sep != NULL) {
12283 sep = PyUnicode_FromObject(sep);
12284 if (sep == NULL) {
12285 Py_DECREF(s);
12286 return NULL;
12287 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012288 }
12289
Victor Stinner9310abb2011-10-05 00:59:23 +020012290 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012291
12292 Py_DECREF(s);
12293 Py_XDECREF(sep);
12294 return result;
12295}
12296
12297PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012298 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012299\n\
12300Return a list of the words in S, using sep as the\n\
12301delimiter string, starting at the end of the string and\n\
12302working to the front. If maxsplit is given, at most maxsplit\n\
12303splits are done. If sep is not specified, any whitespace string\n\
12304is a separator.");
12305
12306static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012307unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012308{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012309 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012310 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012311 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012312
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012313 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12314 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012315 return NULL;
12316
12317 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012318 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012319 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012320 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012321 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012322 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012323}
12324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012325PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012326 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327\n\
12328Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012329Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012330is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331
12332static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012333unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012335 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012336 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012337
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012338 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12339 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340 return NULL;
12341
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012342 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343}
12344
12345static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012346PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012348 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349}
12350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012351PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353\n\
12354Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012355and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356
12357static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012358unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012360 if (PyUnicode_READY(self) == -1)
12361 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012362 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363}
12364
Georg Brandlceee0772007-11-27 23:48:05 +000012365PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012366 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012367\n\
12368Return a translation table usable for str.translate().\n\
12369If there is only one argument, it must be a dictionary mapping Unicode\n\
12370ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012371Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012372If there are two arguments, they must be strings of equal length, and\n\
12373in the resulting dictionary, each character in x will be mapped to the\n\
12374character at the same position in y. If there is a third argument, it\n\
12375must be a string, whose characters will be mapped to None in the result.");
12376
12377static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012378unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012379{
12380 PyObject *x, *y = NULL, *z = NULL;
12381 PyObject *new = NULL, *key, *value;
12382 Py_ssize_t i = 0;
12383 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012384
Georg Brandlceee0772007-11-27 23:48:05 +000012385 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12386 return NULL;
12387 new = PyDict_New();
12388 if (!new)
12389 return NULL;
12390 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 int x_kind, y_kind, z_kind;
12392 void *x_data, *y_data, *z_data;
12393
Georg Brandlceee0772007-11-27 23:48:05 +000012394 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012395 if (!PyUnicode_Check(x)) {
12396 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12397 "be a string if there is a second argument");
12398 goto err;
12399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012401 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12402 "arguments must have equal length");
12403 goto err;
12404 }
12405 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 x_kind = PyUnicode_KIND(x);
12407 y_kind = PyUnicode_KIND(y);
12408 x_data = PyUnicode_DATA(x);
12409 y_data = PyUnicode_DATA(y);
12410 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12411 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012412 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012413 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012414 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012415 if (!value) {
12416 Py_DECREF(key);
12417 goto err;
12418 }
Georg Brandlceee0772007-11-27 23:48:05 +000012419 res = PyDict_SetItem(new, key, value);
12420 Py_DECREF(key);
12421 Py_DECREF(value);
12422 if (res < 0)
12423 goto err;
12424 }
12425 /* create entries for deleting chars in z */
12426 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 z_kind = PyUnicode_KIND(z);
12428 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012429 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012431 if (!key)
12432 goto err;
12433 res = PyDict_SetItem(new, key, Py_None);
12434 Py_DECREF(key);
12435 if (res < 0)
12436 goto err;
12437 }
12438 }
12439 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440 int kind;
12441 void *data;
12442
Georg Brandlceee0772007-11-27 23:48:05 +000012443 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012444 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012445 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12446 "to maketrans it must be a dict");
12447 goto err;
12448 }
12449 /* copy entries into the new dict, converting string keys to int keys */
12450 while (PyDict_Next(x, &i, &key, &value)) {
12451 if (PyUnicode_Check(key)) {
12452 /* convert string keys to integer keys */
12453 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012454 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012455 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12456 "table must be of length 1");
12457 goto err;
12458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 kind = PyUnicode_KIND(key);
12460 data = PyUnicode_DATA(key);
12461 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012462 if (!newkey)
12463 goto err;
12464 res = PyDict_SetItem(new, newkey, value);
12465 Py_DECREF(newkey);
12466 if (res < 0)
12467 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012468 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012469 /* just keep integer keys */
12470 if (PyDict_SetItem(new, key, value) < 0)
12471 goto err;
12472 } else {
12473 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12474 "be strings or integers");
12475 goto err;
12476 }
12477 }
12478 }
12479 return new;
12480 err:
12481 Py_DECREF(new);
12482 return NULL;
12483}
12484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012485PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012486 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487\n\
12488Return a copy of the string S, where all characters have been mapped\n\
12489through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012490Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012491Unmapped characters are left untouched. Characters mapped to None\n\
12492are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493
12494static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012497 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498}
12499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012500PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012501 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012503Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504
12505static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012506unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012508 if (PyUnicode_READY(self) == -1)
12509 return NULL;
12510 if (PyUnicode_IS_ASCII(self))
12511 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012512 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513}
12514
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012515PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012516 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012518Pad a numeric string S with zeros on the left, to fill a field\n\
12519of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520
12521static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012522unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012524 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012525 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012526 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 int kind;
12528 void *data;
12529 Py_UCS4 chr;
12530
Martin v. Löwis18e16552006-02-15 17:27:45 +000012531 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532 return NULL;
12533
Benjamin Petersonbac79492012-01-14 13:34:47 -050012534 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012535 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536
Victor Stinnerc4b49542011-12-11 22:44:26 +010012537 if (PyUnicode_GET_LENGTH(self) >= width)
12538 return unicode_result_unchanged(self);
12539
12540 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541
12542 u = pad(self, fill, 0, '0');
12543
Walter Dörwald068325e2002-04-15 13:36:47 +000012544 if (u == NULL)
12545 return NULL;
12546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 kind = PyUnicode_KIND(u);
12548 data = PyUnicode_DATA(u);
12549 chr = PyUnicode_READ(kind, data, fill);
12550
12551 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 PyUnicode_WRITE(kind, data, 0, chr);
12554 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555 }
12556
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012557 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012558 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560
#if 0
/* Compiled out.  Wrapper around PyUnicode_TransformDecimalAndSpaceToASCII();
   the method table refers to it only inside a matching "#if 0" section. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012569PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012570 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012572Return True if S starts with the specified prefix, False otherwise.\n\
12573With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012574With optional end, stop comparing S at that position.\n\
12575prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576
12577static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012578unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012579 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012581 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012582 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012583 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012584 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012585 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586
Jesus Ceaac451502011-04-20 17:09:23 +020012587 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012588 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012589 if (PyTuple_Check(subobj)) {
12590 Py_ssize_t i;
12591 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012592 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012593 if (substring == NULL)
12594 return NULL;
12595 result = tailmatch(self, substring, start, end, -1);
12596 Py_DECREF(substring);
12597 if (result) {
12598 Py_RETURN_TRUE;
12599 }
12600 }
12601 /* nothing matched */
12602 Py_RETURN_FALSE;
12603 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012604 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012605 if (substring == NULL) {
12606 if (PyErr_ExceptionMatches(PyExc_TypeError))
12607 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12608 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012609 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012610 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012611 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012613 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614}
12615
12616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012617PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012618 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012620Return True if S ends with the specified suffix, False otherwise.\n\
12621With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012622With optional end, stop comparing S at that position.\n\
12623suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624
12625static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012626unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012627 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012629 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012630 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012631 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012632 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012633 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634
Jesus Ceaac451502011-04-20 17:09:23 +020012635 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012636 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012637 if (PyTuple_Check(subobj)) {
12638 Py_ssize_t i;
12639 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012640 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012641 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012642 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012643 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012644 result = tailmatch(self, substring, start, end, +1);
12645 Py_DECREF(substring);
12646 if (result) {
12647 Py_RETURN_TRUE;
12648 }
12649 }
12650 Py_RETURN_FALSE;
12651 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012652 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012653 if (substring == NULL) {
12654 if (PyErr_ExceptionMatches(PyExc_TypeError))
12655 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12656 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012657 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012658 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012659 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012661 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662}
12663
Victor Stinner202fdca2012-05-07 12:47:02 +020012664Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012665_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012666{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012667 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012668 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12669 writer->data = PyUnicode_DATA(writer->buffer);
12670 writer->kind = PyUnicode_KIND(writer->buffer);
12671}
12672
Victor Stinnerd3f08822012-05-29 12:57:52 +020012673void
12674_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012675{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012676 memset(writer, 0, sizeof(*writer));
12677#ifdef Py_DEBUG
12678 writer->kind = 5; /* invalid kind */
12679#endif
12680 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012681 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012682}
12683
Victor Stinnerd3f08822012-05-29 12:57:52 +020012684int
12685_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12686 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012687{
12688 Py_ssize_t newlen;
12689 PyObject *newbuffer;
12690
Victor Stinnerd3f08822012-05-29 12:57:52 +020012691 assert(length > 0);
12692
Victor Stinner202fdca2012-05-07 12:47:02 +020012693 if (length > PY_SSIZE_T_MAX - writer->pos) {
12694 PyErr_NoMemory();
12695 return -1;
12696 }
12697 newlen = writer->pos + length;
12698
Victor Stinnerd3f08822012-05-29 12:57:52 +020012699 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012700 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012701 /* overallocate 25% to limit the number of resize */
12702 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12703 newlen += newlen / 4;
12704 if (newlen < writer->min_length)
12705 newlen = writer->min_length;
12706 }
12707 writer->buffer = PyUnicode_New(newlen, maxchar);
12708 if (writer->buffer == NULL)
12709 return -1;
12710 _PyUnicodeWriter_Update(writer);
12711 return 0;
12712 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012713
Victor Stinnerd3f08822012-05-29 12:57:52 +020012714 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012715 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012716 /* overallocate 25% to limit the number of resize */
12717 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12718 newlen += newlen / 4;
12719 if (newlen < writer->min_length)
12720 newlen = writer->min_length;
12721 }
12722
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012723 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012724 /* resize + widen */
12725 newbuffer = PyUnicode_New(newlen, maxchar);
12726 if (newbuffer == NULL)
12727 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012728 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12729 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012730 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012731 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012732 }
12733 else {
12734 newbuffer = resize_compact(writer->buffer, newlen);
12735 if (newbuffer == NULL)
12736 return -1;
12737 }
12738 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012739 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012740 }
12741 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012742 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012743 newbuffer = PyUnicode_New(writer->size, maxchar);
12744 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012745 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012746 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12747 writer->buffer, 0, writer->pos);
12748 Py_DECREF(writer->buffer);
12749 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012750 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012751 }
12752 return 0;
12753}
12754
Victor Stinnerd3f08822012-05-29 12:57:52 +020012755int
12756_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12757{
12758 Py_UCS4 maxchar;
12759 Py_ssize_t len;
12760
12761 if (PyUnicode_READY(str) == -1)
12762 return -1;
12763 len = PyUnicode_GET_LENGTH(str);
12764 if (len == 0)
12765 return 0;
12766 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12767 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012768 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012769 Py_INCREF(str);
12770 writer->buffer = str;
12771 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012772 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012773 writer->size = 0;
12774 writer->pos += len;
12775 return 0;
12776 }
12777 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12778 return -1;
12779 }
12780 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12781 str, 0, len);
12782 writer->pos += len;
12783 return 0;
12784}
12785
Victor Stinnere215d962012-10-06 23:03:36 +020012786int
12787_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12788{
12789 Py_UCS4 maxchar;
12790
12791 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12792 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12793 return -1;
12794 unicode_write_cstr(writer->buffer, writer->pos, str, len);
12795 writer->pos += len;
12796 return 0;
12797}
12798
Victor Stinnerd3f08822012-05-29 12:57:52 +020012799PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012800_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012801{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012802 if (writer->pos == 0) {
12803 Py_XDECREF(writer->buffer);
12804 Py_INCREF(unicode_empty);
12805 return unicode_empty;
12806 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012807 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012808 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12809 return writer->buffer;
12810 }
12811 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12812 PyObject *newbuffer;
12813 newbuffer = resize_compact(writer->buffer, writer->pos);
12814 if (newbuffer == NULL) {
12815 Py_DECREF(writer->buffer);
12816 return NULL;
12817 }
12818 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012819 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012820 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012821 return writer->buffer;
12822}
12823
Victor Stinnerd3f08822012-05-29 12:57:52 +020012824void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012825_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012826{
12827 Py_CLEAR(writer->buffer);
12828}
12829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012831
12832PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012833 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012834\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012835Return a formatted version of S, using substitutions from args and kwargs.\n\
12836The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012837
Eric Smith27bbca62010-11-04 17:06:58 +000012838PyDoc_STRVAR(format_map__doc__,
12839 "S.format_map(mapping) -> str\n\
12840\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012841Return a formatted version of S, using substitutions from mapping.\n\
12842The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012843
Eric Smith4a7d76d2008-05-30 18:10:19 +000012844static PyObject *
12845unicode__format__(PyObject* self, PyObject* args)
12846{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012847 PyObject *format_spec;
12848 _PyUnicodeWriter writer;
12849 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012850
12851 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12852 return NULL;
12853
Victor Stinnerd3f08822012-05-29 12:57:52 +020012854 if (PyUnicode_READY(self) == -1)
12855 return NULL;
12856 _PyUnicodeWriter_Init(&writer, 0);
12857 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12858 self, format_spec, 0,
12859 PyUnicode_GET_LENGTH(format_spec));
12860 if (ret == -1) {
12861 _PyUnicodeWriter_Dealloc(&writer);
12862 return NULL;
12863 }
12864 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012865}
12866
Eric Smith8c663262007-08-25 02:26:07 +000012867PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012868 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012869\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012870Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012871
12872static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012873unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012874{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875 Py_ssize_t size;
12876
12877 /* If it's a compact object, account for base structure +
12878 character data. */
12879 if (PyUnicode_IS_COMPACT_ASCII(v))
12880 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12881 else if (PyUnicode_IS_COMPACT(v))
12882 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012883 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012884 else {
12885 /* If it is a two-block object, account for base object, and
12886 for character block if present. */
12887 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012888 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012889 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012890 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891 }
12892 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012893 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012894 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012895 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012896 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012897 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898
12899 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012900}
12901
12902PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012903 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012904
12905static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012906unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012907{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012908 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012909 if (!copy)
12910 return NULL;
12911 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012912}
12913
Guido van Rossumd57fd912000-03-10 22:53:23 +000012914static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012915 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012916 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012917 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
12918 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012919 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12920 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050012921 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012922 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12923 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12924 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12925 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12926 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012927 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012928 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12929 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12930 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012931 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012932 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12933 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12934 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012935 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012936 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012937 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012938 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012939 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12940 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12941 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12942 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12943 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12944 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12945 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12946 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12947 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12948 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12949 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12950 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12951 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12952 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012953 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012954 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012955 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012956 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012957 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012958 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012959 {"maketrans", (PyCFunction) unicode_maketrans,
12960 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012961 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012962#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012963 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012964 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965#endif
12966
Benjamin Peterson14339b62009-01-31 16:36:08 +000012967 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012968 {NULL, NULL}
12969};
12970
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012971static PyObject *
12972unicode_mod(PyObject *v, PyObject *w)
12973{
Brian Curtindfc80e32011-08-10 20:28:54 -050012974 if (!PyUnicode_Check(v))
12975 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012976 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012977}
12978
12979static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012980 0, /*nb_add*/
12981 0, /*nb_subtract*/
12982 0, /*nb_multiply*/
12983 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012984};
12985
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012987 (lenfunc) unicode_length, /* sq_length */
12988 PyUnicode_Concat, /* sq_concat */
12989 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12990 (ssizeargfunc) unicode_getitem, /* sq_item */
12991 0, /* sq_slice */
12992 0, /* sq_ass_item */
12993 0, /* sq_ass_slice */
12994 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012995};
12996
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012997static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012998unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 if (PyUnicode_READY(self) == -1)
13001 return NULL;
13002
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013003 if (PyIndex_Check(item)) {
13004 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013005 if (i == -1 && PyErr_Occurred())
13006 return NULL;
13007 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013009 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013010 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013011 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013012 PyObject *result;
13013 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013014 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013015 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013017 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013018 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013019 return NULL;
13020 }
13021
13022 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013023 Py_INCREF(unicode_empty);
13024 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013026 slicelength == PyUnicode_GET_LENGTH(self)) {
13027 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013028 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013029 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013030 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013031 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013032 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013033 src_kind = PyUnicode_KIND(self);
13034 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013035 if (!PyUnicode_IS_ASCII(self)) {
13036 kind_limit = kind_maxchar_limit(src_kind);
13037 max_char = 0;
13038 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13039 ch = PyUnicode_READ(src_kind, src_data, cur);
13040 if (ch > max_char) {
13041 max_char = ch;
13042 if (max_char >= kind_limit)
13043 break;
13044 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013045 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013046 }
Victor Stinner55c99112011-10-13 01:17:06 +020013047 else
13048 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013049 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013050 if (result == NULL)
13051 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013052 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013053 dest_data = PyUnicode_DATA(result);
13054
13055 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013056 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13057 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013058 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013059 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013060 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013061 } else {
13062 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13063 return NULL;
13064 }
13065}
13066
13067static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013068 (lenfunc)unicode_length, /* mp_length */
13069 (binaryfunc)unicode_subscript, /* mp_subscript */
13070 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013071};
13072
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074/* Helpers for PyUnicode_Format() */
13075
Victor Stinnera47082312012-10-04 02:19:54 +020013076struct unicode_formatter_t {
13077 PyObject *args;
13078 int args_owned;
13079 Py_ssize_t arglen, argidx;
13080 PyObject *dict;
13081
13082 enum PyUnicode_Kind fmtkind;
13083 Py_ssize_t fmtcnt, fmtpos;
13084 void *fmtdata;
13085 PyObject *fmtstr;
13086
13087 _PyUnicodeWriter writer;
13088};
13089
13090struct unicode_format_arg_t {
13091 Py_UCS4 ch;
13092 int flags;
13093 Py_ssize_t width;
13094 int prec;
13095 int sign;
13096};
13097
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013099unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100{
Victor Stinnera47082312012-10-04 02:19:54 +020013101 Py_ssize_t argidx = ctx->argidx;
13102
13103 if (argidx < ctx->arglen) {
13104 ctx->argidx++;
13105 if (ctx->arglen < 0)
13106 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013107 else
Victor Stinnera47082312012-10-04 02:19:54 +020013108 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109 }
13110 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013111 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112 return NULL;
13113}
13114
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013115/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116
Victor Stinnera47082312012-10-04 02:19:54 +020013117/* Format a float into the writer if the writer is not NULL, or into *p_output
13118 otherwise.
13119
13120 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013121static int
Victor Stinnera47082312012-10-04 02:19:54 +020013122formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13123 PyObject **p_output,
13124 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013126 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013128 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013129 int prec;
13130 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013131
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132 x = PyFloat_AsDouble(v);
13133 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013134 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013135
Victor Stinnera47082312012-10-04 02:19:54 +020013136 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013138 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013139
Victor Stinnera47082312012-10-04 02:19:54 +020013140 if (arg->flags & F_ALT)
13141 dtoa_flags = Py_DTSF_ALT;
13142 else
13143 dtoa_flags = 0;
13144 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013145 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013146 return -1;
13147 len = strlen(p);
13148 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013149 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13150 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013151 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013152 }
Victor Stinner184252a2012-06-16 02:57:41 +020013153 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013154 writer->pos += len;
13155 }
13156 else
13157 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013158 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013159 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013160}
13161
Victor Stinnerd0880d52012-04-27 23:40:13 +020013162/* formatlong() emulates the format codes d, u, o, x and X, and
13163 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13164 * Python's regular ints.
13165 * Return value: a new PyUnicodeObject*, or NULL if error.
13166 * The output string is of the form
13167 * "-"? ("0x" | "0X")? digit+
13168 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13169 * set in flags. The case of hex digits will be correct,
13170 * There will be at least prec digits, zero-filled on the left if
13171 * necessary to get that many.
13172 * val object to be converted
13173 * flags bitmask of format flags; only F_ALT is looked at
13174 * prec minimum number of digits; 0-fill on left if needed
13175 * type a character in [duoxX]; u acts the same as d
13176 *
13177 * CAUTION: o, x and X conversions on regular ints can never
13178 * produce a '-' sign, but can for Python's unbounded ints.
13179 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013180static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013181formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013182{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013183 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013184 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013185 Py_ssize_t i;
13186 int sign; /* 1 if '-', else 0 */
13187 int len; /* number of characters */
13188 Py_ssize_t llen;
13189 int numdigits; /* len == numnondigits + numdigits */
13190 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013191 int prec = arg->prec;
13192 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013193
Victor Stinnerd0880d52012-04-27 23:40:13 +020013194 /* Avoid exceeding SSIZE_T_MAX */
13195 if (prec > INT_MAX-3) {
13196 PyErr_SetString(PyExc_OverflowError,
13197 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013198 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013199 }
13200
13201 assert(PyLong_Check(val));
13202
13203 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013204 default:
13205 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013206 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013207 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013208 case 'u':
13209 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013210 if (PyBool_Check(val))
13211 result = PyNumber_ToBase(val, 10);
13212 else
13213 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013214 break;
13215 case 'o':
13216 numnondigits = 2;
13217 result = PyNumber_ToBase(val, 8);
13218 break;
13219 case 'x':
13220 case 'X':
13221 numnondigits = 2;
13222 result = PyNumber_ToBase(val, 16);
13223 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013224 }
13225 if (!result)
13226 return NULL;
13227
13228 assert(unicode_modifiable(result));
13229 assert(PyUnicode_IS_READY(result));
13230 assert(PyUnicode_IS_ASCII(result));
13231
13232 /* To modify the string in-place, there can only be one reference. */
13233 if (Py_REFCNT(result) != 1) {
13234 PyErr_BadInternalCall();
13235 return NULL;
13236 }
13237 buf = PyUnicode_DATA(result);
13238 llen = PyUnicode_GET_LENGTH(result);
13239 if (llen > INT_MAX) {
13240 PyErr_SetString(PyExc_ValueError,
13241 "string too large in _PyBytes_FormatLong");
13242 return NULL;
13243 }
13244 len = (int)llen;
13245 sign = buf[0] == '-';
13246 numnondigits += sign;
13247 numdigits = len - numnondigits;
13248 assert(numdigits > 0);
13249
13250 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013251 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013252 (type == 'o' || type == 'x' || type == 'X'))) {
13253 assert(buf[sign] == '0');
13254 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13255 buf[sign+1] == 'o');
13256 numnondigits -= 2;
13257 buf += 2;
13258 len -= 2;
13259 if (sign)
13260 buf[0] = '-';
13261 assert(len == numnondigits + numdigits);
13262 assert(numdigits > 0);
13263 }
13264
13265 /* Fill with leading zeroes to meet minimum width. */
13266 if (prec > numdigits) {
13267 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13268 numnondigits + prec);
13269 char *b1;
13270 if (!r1) {
13271 Py_DECREF(result);
13272 return NULL;
13273 }
13274 b1 = PyBytes_AS_STRING(r1);
13275 for (i = 0; i < numnondigits; ++i)
13276 *b1++ = *buf++;
13277 for (i = 0; i < prec - numdigits; i++)
13278 *b1++ = '0';
13279 for (i = 0; i < numdigits; i++)
13280 *b1++ = *buf++;
13281 *b1 = '\0';
13282 Py_DECREF(result);
13283 result = r1;
13284 buf = PyBytes_AS_STRING(result);
13285 len = numnondigits + prec;
13286 }
13287
13288 /* Fix up case for hex conversions. */
13289 if (type == 'X') {
13290 /* Need to convert all lower case letters to upper case.
13291 and need to convert 0x to 0X (and -0x to -0X). */
13292 for (i = 0; i < len; i++)
13293 if (buf[i] >= 'a' && buf[i] <= 'x')
13294 buf[i] -= 'a'-'A';
13295 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013296 if (!PyUnicode_Check(result)
13297 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013298 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013299 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013300 Py_DECREF(result);
13301 result = unicode;
13302 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013303 else if (len != PyUnicode_GET_LENGTH(result)) {
13304 if (PyUnicode_Resize(&result, len) < 0)
13305 Py_CLEAR(result);
13306 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013307 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013308}
13309
Victor Stinner621ef3d2012-10-02 00:33:47 +020013310/* Format an integer.
13311 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013312 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013313 * -1 and raise an exception on error */
13314static int
Victor Stinnera47082312012-10-04 02:19:54 +020013315mainformatlong(PyObject *v,
13316 struct unicode_format_arg_t *arg,
13317 PyObject **p_output,
13318 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013319{
13320 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013321 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013322
13323 if (!PyNumber_Check(v))
13324 goto wrongtype;
13325
13326 if (!PyLong_Check(v)) {
13327 iobj = PyNumber_Long(v);
13328 if (iobj == NULL) {
13329 if (PyErr_ExceptionMatches(PyExc_TypeError))
13330 goto wrongtype;
13331 return -1;
13332 }
13333 assert(PyLong_Check(iobj));
13334 }
13335 else {
13336 iobj = v;
13337 Py_INCREF(iobj);
13338 }
13339
13340 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013341 && arg->width == -1 && arg->prec == -1
13342 && !(arg->flags & (F_SIGN | F_BLANK))
13343 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013344 {
13345 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013346 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013347 int base;
13348
Victor Stinnera47082312012-10-04 02:19:54 +020013349 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013350 {
13351 default:
13352 assert(0 && "'type' not in [diuoxX]");
13353 case 'd':
13354 case 'i':
13355 case 'u':
13356 base = 10;
13357 break;
13358 case 'o':
13359 base = 8;
13360 break;
13361 case 'x':
13362 case 'X':
13363 base = 16;
13364 break;
13365 }
13366
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013367 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13368 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013369 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013370 }
13371 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013372 return 1;
13373 }
13374
Victor Stinnera47082312012-10-04 02:19:54 +020013375 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013376 Py_DECREF(iobj);
13377 if (res == NULL)
13378 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013379 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013380 return 0;
13381
13382wrongtype:
13383 PyErr_Format(PyExc_TypeError,
13384 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013385 "not %.200s",
13386 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013387 return -1;
13388}
13389
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013390static Py_UCS4
13391formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013392{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013393 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013394 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013395 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013396 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013397 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013398 goto onError;
13399 }
13400 else {
13401 /* Integer input truncated to a character */
13402 long x;
13403 x = PyLong_AsLong(v);
13404 if (x == -1 && PyErr_Occurred())
13405 goto onError;
13406
Victor Stinner8faf8212011-12-08 22:14:11 +010013407 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 PyErr_SetString(PyExc_OverflowError,
13409 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013410 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 }
13412
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013413 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013414 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013415
Benjamin Peterson29060642009-01-31 22:14:21 +000013416 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013417 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013418 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013419 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420}
13421
Victor Stinnera47082312012-10-04 02:19:54 +020013422/* Parse options of an argument: flags, width, precision.
13423 Handle also "%(name)" syntax.
13424
13425 Return 0 if the argument has been formatted into arg->str.
13426 Return 1 if the argument has been written into ctx->writer,
13427 Raise an exception and return -1 on error. */
13428static int
13429unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13430 struct unicode_format_arg_t *arg)
13431{
13432#define FORMAT_READ(ctx) \
13433 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13434
13435 PyObject *v;
13436
13437 arg->ch = FORMAT_READ(ctx);
13438 if (arg->ch == '(') {
13439 /* Get argument value from a dictionary. Example: "%(name)s". */
13440 Py_ssize_t keystart;
13441 Py_ssize_t keylen;
13442 PyObject *key;
13443 int pcount = 1;
13444
13445 if (ctx->dict == NULL) {
13446 PyErr_SetString(PyExc_TypeError,
13447 "format requires a mapping");
13448 return -1;
13449 }
13450 ++ctx->fmtpos;
13451 --ctx->fmtcnt;
13452 keystart = ctx->fmtpos;
13453 /* Skip over balanced parentheses */
13454 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13455 arg->ch = FORMAT_READ(ctx);
13456 if (arg->ch == ')')
13457 --pcount;
13458 else if (arg->ch == '(')
13459 ++pcount;
13460 ctx->fmtpos++;
13461 }
13462 keylen = ctx->fmtpos - keystart - 1;
13463 if (ctx->fmtcnt < 0 || pcount > 0) {
13464 PyErr_SetString(PyExc_ValueError,
13465 "incomplete format key");
13466 return -1;
13467 }
13468 key = PyUnicode_Substring(ctx->fmtstr,
13469 keystart, keystart + keylen);
13470 if (key == NULL)
13471 return -1;
13472 if (ctx->args_owned) {
13473 Py_DECREF(ctx->args);
13474 ctx->args_owned = 0;
13475 }
13476 ctx->args = PyObject_GetItem(ctx->dict, key);
13477 Py_DECREF(key);
13478 if (ctx->args == NULL)
13479 return -1;
13480 ctx->args_owned = 1;
13481 ctx->arglen = -1;
13482 ctx->argidx = -2;
13483 }
13484
13485 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
13486 arg->flags = 0;
13487 while (--ctx->fmtcnt >= 0) {
13488 arg->ch = FORMAT_READ(ctx);
13489 ctx->fmtpos++;
13490 switch (arg->ch) {
13491 case '-': arg->flags |= F_LJUST; continue;
13492 case '+': arg->flags |= F_SIGN; continue;
13493 case ' ': arg->flags |= F_BLANK; continue;
13494 case '#': arg->flags |= F_ALT; continue;
13495 case '0': arg->flags |= F_ZERO; continue;
13496 }
13497 break;
13498 }
13499
13500 /* Parse width. Example: "%10s" => width=10 */
13501 arg->width = -1;
13502 if (arg->ch == '*') {
13503 v = unicode_format_getnextarg(ctx);
13504 if (v == NULL)
13505 return -1;
13506 if (!PyLong_Check(v)) {
13507 PyErr_SetString(PyExc_TypeError,
13508 "* wants int");
13509 return -1;
13510 }
13511 arg->width = PyLong_AsLong(v);
13512 if (arg->width == -1 && PyErr_Occurred())
13513 return -1;
13514 if (arg->width < 0) {
13515 arg->flags |= F_LJUST;
13516 arg->width = -arg->width;
13517 }
13518 if (--ctx->fmtcnt >= 0) {
13519 arg->ch = FORMAT_READ(ctx);
13520 ctx->fmtpos++;
13521 }
13522 }
13523 else if (arg->ch >= '0' && arg->ch <= '9') {
13524 arg->width = arg->ch - '0';
13525 while (--ctx->fmtcnt >= 0) {
13526 arg->ch = FORMAT_READ(ctx);
13527 ctx->fmtpos++;
13528 if (arg->ch < '0' || arg->ch > '9')
13529 break;
13530 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13531 mixing signed and unsigned comparison. Since arg->ch is between
13532 '0' and '9', casting to int is safe. */
13533 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13534 PyErr_SetString(PyExc_ValueError,
13535 "width too big");
13536 return -1;
13537 }
13538 arg->width = arg->width*10 + (arg->ch - '0');
13539 }
13540 }
13541
13542 /* Parse precision. Example: "%.3f" => prec=3 */
13543 arg->prec = -1;
13544 if (arg->ch == '.') {
13545 arg->prec = 0;
13546 if (--ctx->fmtcnt >= 0) {
13547 arg->ch = FORMAT_READ(ctx);
13548 ctx->fmtpos++;
13549 }
13550 if (arg->ch == '*') {
13551 v = unicode_format_getnextarg(ctx);
13552 if (v == NULL)
13553 return -1;
13554 if (!PyLong_Check(v)) {
13555 PyErr_SetString(PyExc_TypeError,
13556 "* wants int");
13557 return -1;
13558 }
13559 arg->prec = PyLong_AsLong(v);
13560 if (arg->prec == -1 && PyErr_Occurred())
13561 return -1;
13562 if (arg->prec < 0)
13563 arg->prec = 0;
13564 if (--ctx->fmtcnt >= 0) {
13565 arg->ch = FORMAT_READ(ctx);
13566 ctx->fmtpos++;
13567 }
13568 }
13569 else if (arg->ch >= '0' && arg->ch <= '9') {
13570 arg->prec = arg->ch - '0';
13571 while (--ctx->fmtcnt >= 0) {
13572 arg->ch = FORMAT_READ(ctx);
13573 ctx->fmtpos++;
13574 if (arg->ch < '0' || arg->ch > '9')
13575 break;
13576 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13577 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013578 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013579 return -1;
13580 }
13581 arg->prec = arg->prec*10 + (arg->ch - '0');
13582 }
13583 }
13584 }
13585
13586 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13587 if (ctx->fmtcnt >= 0) {
13588 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13589 if (--ctx->fmtcnt >= 0) {
13590 arg->ch = FORMAT_READ(ctx);
13591 ctx->fmtpos++;
13592 }
13593 }
13594 }
13595 if (ctx->fmtcnt < 0) {
13596 PyErr_SetString(PyExc_ValueError,
13597 "incomplete format");
13598 return -1;
13599 }
13600 return 0;
13601
13602#undef FORMAT_READ
13603}
13604
13605/* Format one argument. Supported conversion specifiers:
13606
13607 - "s", "r", "a": any type
13608 - "i", "d", "u", "o", "x", "X": int
13609 - "e", "E", "f", "F", "g", "G": float
13610 - "c": int or str (1 character)
13611
13612 Return 0 if the argument has been formatted into *p_str,
13613 1 if the argument has been written into ctx->writer,
13614 -1 on error. */
13615static int
13616unicode_format_arg_format(struct unicode_formatter_t *ctx,
13617 struct unicode_format_arg_t *arg,
13618 PyObject **p_str)
13619{
13620 PyObject *v;
13621 _PyUnicodeWriter *writer = &ctx->writer;
13622
13623 if (ctx->fmtcnt == 0)
13624 ctx->writer.overallocate = 0;
13625
13626 if (arg->ch == '%') {
13627 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13628 return -1;
13629 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13630 writer->pos += 1;
13631 return 1;
13632 }
13633
13634 v = unicode_format_getnextarg(ctx);
13635 if (v == NULL)
13636 return -1;
13637
13638 arg->sign = 0;
13639
13640 switch (arg->ch) {
13641
13642 case 's':
13643 case 'r':
13644 case 'a':
13645 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13646 /* Fast path */
13647 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13648 return -1;
13649 return 1;
13650 }
13651
13652 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13653 *p_str = v;
13654 Py_INCREF(*p_str);
13655 }
13656 else {
13657 if (arg->ch == 's')
13658 *p_str = PyObject_Str(v);
13659 else if (arg->ch == 'r')
13660 *p_str = PyObject_Repr(v);
13661 else
13662 *p_str = PyObject_ASCII(v);
13663 }
13664 break;
13665
13666 case 'i':
13667 case 'd':
13668 case 'u':
13669 case 'o':
13670 case 'x':
13671 case 'X':
13672 {
13673 int ret = mainformatlong(v, arg, p_str, writer);
13674 if (ret != 0)
13675 return ret;
13676 arg->sign = 1;
13677 break;
13678 }
13679
13680 case 'e':
13681 case 'E':
13682 case 'f':
13683 case 'F':
13684 case 'g':
13685 case 'G':
13686 if (arg->width == -1 && arg->prec == -1
13687 && !(arg->flags & (F_SIGN | F_BLANK)))
13688 {
13689 /* Fast path */
13690 if (formatfloat(v, arg, NULL, writer) == -1)
13691 return -1;
13692 return 1;
13693 }
13694
13695 arg->sign = 1;
13696 if (formatfloat(v, arg, p_str, NULL) == -1)
13697 return -1;
13698 break;
13699
13700 case 'c':
13701 {
13702 Py_UCS4 ch = formatchar(v);
13703 if (ch == (Py_UCS4) -1)
13704 return -1;
13705 if (arg->width == -1 && arg->prec == -1) {
13706 /* Fast path */
13707 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13708 return -1;
13709 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13710 writer->pos += 1;
13711 return 1;
13712 }
13713 *p_str = PyUnicode_FromOrdinal(ch);
13714 break;
13715 }
13716
13717 default:
13718 PyErr_Format(PyExc_ValueError,
13719 "unsupported format character '%c' (0x%x) "
13720 "at index %zd",
13721 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13722 (int)arg->ch,
13723 ctx->fmtpos - 1);
13724 return -1;
13725 }
13726 if (*p_str == NULL)
13727 return -1;
13728 assert (PyUnicode_Check(*p_str));
13729 return 0;
13730}
13731
13732static int
13733unicode_format_arg_output(struct unicode_formatter_t *ctx,
13734 struct unicode_format_arg_t *arg,
13735 PyObject *str)
13736{
13737 Py_ssize_t len;
13738 enum PyUnicode_Kind kind;
13739 void *pbuf;
13740 Py_ssize_t pindex;
13741 Py_UCS4 signchar;
13742 Py_ssize_t buflen;
13743 Py_UCS4 maxchar, bufmaxchar;
13744 Py_ssize_t sublen;
13745 _PyUnicodeWriter *writer = &ctx->writer;
13746 Py_UCS4 fill;
13747
13748 fill = ' ';
13749 if (arg->sign && arg->flags & F_ZERO)
13750 fill = '0';
13751
13752 if (PyUnicode_READY(str) == -1)
13753 return -1;
13754
13755 len = PyUnicode_GET_LENGTH(str);
13756 if ((arg->width == -1 || arg->width <= len)
13757 && (arg->prec == -1 || arg->prec >= len)
13758 && !(arg->flags & (F_SIGN | F_BLANK)))
13759 {
13760 /* Fast path */
13761 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13762 return -1;
13763 return 0;
13764 }
13765
13766 /* Truncate the string for "s", "r" and "a" formats
13767 if the precision is set */
13768 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13769 if (arg->prec >= 0 && len > arg->prec)
13770 len = arg->prec;
13771 }
13772
13773 /* Adjust sign and width */
13774 kind = PyUnicode_KIND(str);
13775 pbuf = PyUnicode_DATA(str);
13776 pindex = 0;
13777 signchar = '\0';
13778 if (arg->sign) {
13779 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13780 if (ch == '-' || ch == '+') {
13781 signchar = ch;
13782 len--;
13783 pindex++;
13784 }
13785 else if (arg->flags & F_SIGN)
13786 signchar = '+';
13787 else if (arg->flags & F_BLANK)
13788 signchar = ' ';
13789 else
13790 arg->sign = 0;
13791 }
13792 if (arg->width < len)
13793 arg->width = len;
13794
13795 /* Prepare the writer */
13796 bufmaxchar = 127;
13797 if (!(arg->flags & F_LJUST)) {
13798 if (arg->sign) {
13799 if ((arg->width-1) > len)
13800 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13801 }
13802 else {
13803 if (arg->width > len)
13804 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13805 }
13806 }
13807 maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13808 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13809 buflen = arg->width;
13810 if (arg->sign && len == arg->width)
13811 buflen++;
13812 if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1)
13813 return -1;
13814
13815 /* Write the sign if needed */
13816 if (arg->sign) {
13817 if (fill != ' ') {
13818 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13819 writer->pos += 1;
13820 }
13821 if (arg->width > len)
13822 arg->width--;
13823 }
13824
13825 /* Write the numeric prefix for "x", "X" and "o" formats
13826 if the alternate form is used.
13827 For example, write "0x" for the "%#x" format. */
13828 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13829 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13830 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
13831 if (fill != ' ') {
13832 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13833 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13834 writer->pos += 2;
13835 pindex += 2;
13836 }
13837 arg->width -= 2;
13838 if (arg->width < 0)
13839 arg->width = 0;
13840 len -= 2;
13841 }
13842
13843 /* Pad left with the fill character if needed */
13844 if (arg->width > len && !(arg->flags & F_LJUST)) {
13845 sublen = arg->width - len;
13846 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
13847 writer->pos += sublen;
13848 arg->width = len;
13849 }
13850
13851 /* If padding with spaces: write sign if needed and/or numeric prefix if
13852 the alternate form is used */
13853 if (fill == ' ') {
13854 if (arg->sign) {
13855 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13856 writer->pos += 1;
13857 }
13858 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13859 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13860 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
13861 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13862 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13863 writer->pos += 2;
13864 pindex += 2;
13865 }
13866 }
13867
13868 /* Write characters */
13869 if (len) {
13870 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13871 str, pindex, len);
13872 writer->pos += len;
13873 }
13874
13875 /* Pad right with the fill character if needed */
13876 if (arg->width > len) {
13877 sublen = arg->width - len;
13878 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
13879 writer->pos += sublen;
13880 }
13881 return 0;
13882}
13883
13884/* Helper of PyUnicode_Format(): format one arg.
13885 Return 0 on success, raise an exception and return -1 on error. */
13886static int
13887unicode_format_arg(struct unicode_formatter_t *ctx)
13888{
13889 struct unicode_format_arg_t arg;
13890 PyObject *str;
13891 int ret;
13892
13893 ret = unicode_format_arg_parse(ctx, &arg);
13894 if (ret == -1)
13895 return -1;
13896
13897 ret = unicode_format_arg_format(ctx, &arg, &str);
13898 if (ret == -1)
13899 return -1;
13900
13901 if (ret != 1) {
13902 ret = unicode_format_arg_output(ctx, &arg, str);
13903 Py_DECREF(str);
13904 if (ret == -1)
13905 return -1;
13906 }
13907
13908 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
13909 PyErr_SetString(PyExc_TypeError,
13910 "not all arguments converted during string formatting");
13911 return -1;
13912 }
13913 return 0;
13914}
13915
Alexander Belopolsky40018472011-02-26 01:02:56 +000013916PyObject *
13917PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013918{
Victor Stinnera47082312012-10-04 02:19:54 +020013919 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000013920
Guido van Rossumd57fd912000-03-10 22:53:23 +000013921 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013922 PyErr_BadInternalCall();
13923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013924 }
Victor Stinnera47082312012-10-04 02:19:54 +020013925
13926 ctx.fmtstr = PyUnicode_FromObject(format);
13927 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013928 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020013929 if (PyUnicode_READY(ctx.fmtstr) == -1) {
13930 Py_DECREF(ctx.fmtstr);
13931 return NULL;
13932 }
13933 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
13934 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
13935 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
13936 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013937
Victor Stinnera47082312012-10-04 02:19:54 +020013938 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013939
Guido van Rossumd57fd912000-03-10 22:53:23 +000013940 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020013941 ctx.arglen = PyTuple_Size(args);
13942 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013943 }
13944 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013945 ctx.arglen = -1;
13946 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013947 }
Victor Stinnera47082312012-10-04 02:19:54 +020013948 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013949 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020013950 ctx.dict = args;
13951 else
13952 ctx.dict = NULL;
13953 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013954
Victor Stinnera47082312012-10-04 02:19:54 +020013955 while (--ctx.fmtcnt >= 0) {
13956 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13957 Py_ssize_t nonfmtpos, sublen;
13958 Py_UCS4 maxchar;
13959
13960 nonfmtpos = ctx.fmtpos++;
13961 while (ctx.fmtcnt >= 0 &&
13962 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13963 ctx.fmtpos++;
13964 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013965 }
Victor Stinnera47082312012-10-04 02:19:54 +020013966 if (ctx.fmtcnt < 0) {
13967 ctx.fmtpos--;
13968 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020013969 }
Victor Stinnera47082312012-10-04 02:19:54 +020013970 sublen = ctx.fmtpos - nonfmtpos;
13971 maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr,
Victor Stinneree4544c2012-05-09 22:24:08 +020013972 nonfmtpos, nonfmtpos + sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020013973 if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013974 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013975
Victor Stinnera47082312012-10-04 02:19:54 +020013976 _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos,
13977 ctx.fmtstr, nonfmtpos, sublen);
13978 ctx.writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013979 }
13980 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013981 ctx.fmtpos++;
13982 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013983 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020013984 }
13985 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013986
Victor Stinnera47082312012-10-04 02:19:54 +020013987 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013988 PyErr_SetString(PyExc_TypeError,
13989 "not all arguments converted during string formatting");
13990 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013991 }
13992
Victor Stinnera47082312012-10-04 02:19:54 +020013993 if (ctx.args_owned) {
13994 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013995 }
Victor Stinnera47082312012-10-04 02:19:54 +020013996 Py_DECREF(ctx.fmtstr);
13997 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013998
Benjamin Peterson29060642009-01-31 22:14:21 +000013999 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014000 Py_DECREF(ctx.fmtstr);
14001 _PyUnicodeWriter_Dealloc(&ctx.writer);
14002 if (ctx.args_owned) {
14003 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014004 }
14005 return NULL;
14006}
14007
Jeremy Hylton938ace62002-07-17 16:30:39 +000014008static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014009unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14010
Tim Peters6d6c1a32001-08-02 04:15:00 +000014011static PyObject *
14012unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14013{
Benjamin Peterson29060642009-01-31 22:14:21 +000014014 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014015 static char *kwlist[] = {"object", "encoding", "errors", 0};
14016 char *encoding = NULL;
14017 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014018
Benjamin Peterson14339b62009-01-31 16:36:08 +000014019 if (type != &PyUnicode_Type)
14020 return unicode_subtype_new(type, args, kwds);
14021 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014022 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014023 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014024 if (x == NULL) {
14025 Py_INCREF(unicode_empty);
14026 return unicode_empty;
14027 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014028 if (encoding == NULL && errors == NULL)
14029 return PyObject_Str(x);
14030 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014031 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014032}
14033
Guido van Rossume023fe02001-08-30 03:12:59 +000014034static PyObject *
14035unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14036{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014037 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014038 Py_ssize_t length, char_size;
14039 int share_wstr, share_utf8;
14040 unsigned int kind;
14041 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014042
Benjamin Peterson14339b62009-01-31 16:36:08 +000014043 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014044
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014045 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014046 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014047 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014048 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014049 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014050 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014051 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014052 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014053
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014054 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014055 if (self == NULL) {
14056 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014057 return NULL;
14058 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014059 kind = PyUnicode_KIND(unicode);
14060 length = PyUnicode_GET_LENGTH(unicode);
14061
14062 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014063#ifdef Py_DEBUG
14064 _PyUnicode_HASH(self) = -1;
14065#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014066 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014067#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014068 _PyUnicode_STATE(self).interned = 0;
14069 _PyUnicode_STATE(self).kind = kind;
14070 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014071 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014072 _PyUnicode_STATE(self).ready = 1;
14073 _PyUnicode_WSTR(self) = NULL;
14074 _PyUnicode_UTF8_LENGTH(self) = 0;
14075 _PyUnicode_UTF8(self) = NULL;
14076 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014077 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014078
14079 share_utf8 = 0;
14080 share_wstr = 0;
14081 if (kind == PyUnicode_1BYTE_KIND) {
14082 char_size = 1;
14083 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14084 share_utf8 = 1;
14085 }
14086 else if (kind == PyUnicode_2BYTE_KIND) {
14087 char_size = 2;
14088 if (sizeof(wchar_t) == 2)
14089 share_wstr = 1;
14090 }
14091 else {
14092 assert(kind == PyUnicode_4BYTE_KIND);
14093 char_size = 4;
14094 if (sizeof(wchar_t) == 4)
14095 share_wstr = 1;
14096 }
14097
14098 /* Ensure we won't overflow the length. */
14099 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14100 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014101 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014102 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014103 data = PyObject_MALLOC((length + 1) * char_size);
14104 if (data == NULL) {
14105 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014106 goto onError;
14107 }
14108
Victor Stinnerc3c74152011-10-02 20:39:55 +020014109 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014110 if (share_utf8) {
14111 _PyUnicode_UTF8_LENGTH(self) = length;
14112 _PyUnicode_UTF8(self) = data;
14113 }
14114 if (share_wstr) {
14115 _PyUnicode_WSTR_LENGTH(self) = length;
14116 _PyUnicode_WSTR(self) = (wchar_t *)data;
14117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014118
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014119 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014120 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014121 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014122#ifdef Py_DEBUG
14123 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14124#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014125 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014126 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014127
14128onError:
14129 Py_DECREF(unicode);
14130 Py_DECREF(self);
14131 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014132}
14133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014134PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014135"str(object='') -> str\n\
14136str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014137\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014138Create a new string object from the given object. If encoding or\n\
14139errors is specified, then the object must expose a data buffer\n\
14140that will be decoded using the given encoding and error handler.\n\
14141Otherwise, returns the result of object.__str__() (if defined)\n\
14142or repr(object).\n\
14143encoding defaults to sys.getdefaultencoding().\n\
14144errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014145
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014146static PyObject *unicode_iter(PyObject *seq);
14147
Guido van Rossumd57fd912000-03-10 22:53:23 +000014148PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014149 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014150 "str", /* tp_name */
14151 sizeof(PyUnicodeObject), /* tp_size */
14152 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014153 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014154 (destructor)unicode_dealloc, /* tp_dealloc */
14155 0, /* tp_print */
14156 0, /* tp_getattr */
14157 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014158 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014159 unicode_repr, /* tp_repr */
14160 &unicode_as_number, /* tp_as_number */
14161 &unicode_as_sequence, /* tp_as_sequence */
14162 &unicode_as_mapping, /* tp_as_mapping */
14163 (hashfunc) unicode_hash, /* tp_hash*/
14164 0, /* tp_call*/
14165 (reprfunc) unicode_str, /* tp_str */
14166 PyObject_GenericGetAttr, /* tp_getattro */
14167 0, /* tp_setattro */
14168 0, /* tp_as_buffer */
14169 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014170 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014171 unicode_doc, /* tp_doc */
14172 0, /* tp_traverse */
14173 0, /* tp_clear */
14174 PyUnicode_RichCompare, /* tp_richcompare */
14175 0, /* tp_weaklistoffset */
14176 unicode_iter, /* tp_iter */
14177 0, /* tp_iternext */
14178 unicode_methods, /* tp_methods */
14179 0, /* tp_members */
14180 0, /* tp_getset */
14181 &PyBaseObject_Type, /* tp_base */
14182 0, /* tp_dict */
14183 0, /* tp_descr_get */
14184 0, /* tp_descr_set */
14185 0, /* tp_dictoffset */
14186 0, /* tp_init */
14187 0, /* tp_alloc */
14188 unicode_new, /* tp_new */
14189 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014190};
14191
14192/* Initialize the Unicode implementation */
14193
Victor Stinner3a50e702011-10-18 21:21:00 +020014194int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014195{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014196 int i;
14197
Thomas Wouters477c8d52006-05-27 19:21:47 +000014198 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014199 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014200 0x000A, /* LINE FEED */
14201 0x000D, /* CARRIAGE RETURN */
14202 0x001C, /* FILE SEPARATOR */
14203 0x001D, /* GROUP SEPARATOR */
14204 0x001E, /* RECORD SEPARATOR */
14205 0x0085, /* NEXT LINE */
14206 0x2028, /* LINE SEPARATOR */
14207 0x2029, /* PARAGRAPH SEPARATOR */
14208 };
14209
Fred Drakee4315f52000-05-09 19:53:39 +000014210 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014211 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014212 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014213 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014214 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014215
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014216 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014217 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014218 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014219 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014220
14221 /* initialize the linebreak bloom filter */
14222 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014223 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014224 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014225
14226 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014227
14228#ifdef HAVE_MBCS
14229 winver.dwOSVersionInfoSize = sizeof(winver);
14230 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14231 PyErr_SetFromWindowsErr(0);
14232 return -1;
14233 }
14234#endif
14235 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014236}
14237
14238/* Finalize the Unicode implementation */
14239
/* This implementation performs no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
14245
Guido van Rossumd57fd912000-03-10 22:53:23 +000014246void
Thomas Wouters78890102000-07-22 19:25:51 +000014247_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014248{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014249 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014250
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014251 Py_XDECREF(unicode_empty);
14252 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014254 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014255 if (unicode_latin1[i]) {
14256 Py_DECREF(unicode_latin1[i]);
14257 unicode_latin1[i] = NULL;
14258 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014259 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014260 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014261 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014262}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014263
Walter Dörwald16807132007-05-25 13:52:07 +000014264void
14265PyUnicode_InternInPlace(PyObject **p)
14266{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014267 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014268 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014269#ifdef Py_DEBUG
14270 assert(s != NULL);
14271 assert(_PyUnicode_CHECK(s));
14272#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014273 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014274 return;
14275#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014276 /* If it's a subclass, we don't really know what putting
14277 it in the interned dict might do. */
14278 if (!PyUnicode_CheckExact(s))
14279 return;
14280 if (PyUnicode_CHECK_INTERNED(s))
14281 return;
14282 if (interned == NULL) {
14283 interned = PyDict_New();
14284 if (interned == NULL) {
14285 PyErr_Clear(); /* Don't leave an exception */
14286 return;
14287 }
14288 }
14289 /* It might be that the GetItem call fails even
14290 though the key is present in the dictionary,
14291 namely when this happens during a stack overflow. */
14292 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014293 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014294 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014295
Benjamin Peterson29060642009-01-31 22:14:21 +000014296 if (t) {
14297 Py_INCREF(t);
14298 Py_DECREF(*p);
14299 *p = t;
14300 return;
14301 }
Walter Dörwald16807132007-05-25 13:52:07 +000014302
Benjamin Peterson14339b62009-01-31 16:36:08 +000014303 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014304 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014305 PyErr_Clear();
14306 PyThreadState_GET()->recursion_critical = 0;
14307 return;
14308 }
14309 PyThreadState_GET()->recursion_critical = 0;
14310 /* The two references in interned are not counted by refcnt.
14311 The deallocator will take care of this */
14312 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014313 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014314}
14315
14316void
14317PyUnicode_InternImmortal(PyObject **p)
14318{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014319 PyUnicode_InternInPlace(p);
14320 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014321 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014322 Py_INCREF(*p);
14323 }
Walter Dörwald16807132007-05-25 13:52:07 +000014324}
14325
14326PyObject *
14327PyUnicode_InternFromString(const char *cp)
14328{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014329 PyObject *s = PyUnicode_FromString(cp);
14330 if (s == NULL)
14331 return NULL;
14332 PyUnicode_InternInPlace(&s);
14333 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014334}
14335
Alexander Belopolsky40018472011-02-26 01:02:56 +000014336void
14337_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014338{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014339 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014340 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014341 Py_ssize_t i, n;
14342 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014343
Benjamin Peterson14339b62009-01-31 16:36:08 +000014344 if (interned == NULL || !PyDict_Check(interned))
14345 return;
14346 keys = PyDict_Keys(interned);
14347 if (keys == NULL || !PyList_Check(keys)) {
14348 PyErr_Clear();
14349 return;
14350 }
Walter Dörwald16807132007-05-25 13:52:07 +000014351
Benjamin Peterson14339b62009-01-31 16:36:08 +000014352 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14353 detector, interned unicode strings are not forcibly deallocated;
14354 rather, we give them their stolen references back, and then clear
14355 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014356
Benjamin Peterson14339b62009-01-31 16:36:08 +000014357 n = PyList_GET_SIZE(keys);
14358 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014359 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014360 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014361 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014362 if (PyUnicode_READY(s) == -1) {
14363 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014364 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014366 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014367 case SSTATE_NOT_INTERNED:
14368 /* XXX Shouldn't happen */
14369 break;
14370 case SSTATE_INTERNED_IMMORTAL:
14371 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014372 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014373 break;
14374 case SSTATE_INTERNED_MORTAL:
14375 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014376 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014377 break;
14378 default:
14379 Py_FatalError("Inconsistent interned string state.");
14380 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014381 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014382 }
14383 fprintf(stderr, "total size of all interned strings: "
14384 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14385 "mortal/immortal\n", mortal_size, immortal_size);
14386 Py_DECREF(keys);
14387 PyDict_Clear(interned);
14388 Py_DECREF(interned);
14389 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014390}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014391
14392
14393/********************* Unicode Iterator **************************/
14394
14395typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014396 PyObject_HEAD
14397 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014398 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014399} unicodeiterobject;
14400
14401static void
14402unicodeiter_dealloc(unicodeiterobject *it)
14403{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014404 _PyObject_GC_UNTRACK(it);
14405 Py_XDECREF(it->it_seq);
14406 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014407}
14408
14409static int
14410unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14411{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014412 Py_VISIT(it->it_seq);
14413 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014414}
14415
14416static PyObject *
14417unicodeiter_next(unicodeiterobject *it)
14418{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014419 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014420
Benjamin Peterson14339b62009-01-31 16:36:08 +000014421 assert(it != NULL);
14422 seq = it->it_seq;
14423 if (seq == NULL)
14424 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014425 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014427 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14428 int kind = PyUnicode_KIND(seq);
14429 void *data = PyUnicode_DATA(seq);
14430 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14431 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014432 if (item != NULL)
14433 ++it->it_index;
14434 return item;
14435 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014436
Benjamin Peterson14339b62009-01-31 16:36:08 +000014437 Py_DECREF(seq);
14438 it->it_seq = NULL;
14439 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014440}
14441
14442static PyObject *
14443unicodeiter_len(unicodeiterobject *it)
14444{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014445 Py_ssize_t len = 0;
14446 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014447 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014448 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014449}
14450
14451PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14452
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014453static PyObject *
14454unicodeiter_reduce(unicodeiterobject *it)
14455{
14456 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014457 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014458 it->it_seq, it->it_index);
14459 } else {
14460 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14461 if (u == NULL)
14462 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014463 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014464 }
14465}
14466
14467PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14468
14469static PyObject *
14470unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14471{
14472 Py_ssize_t index = PyLong_AsSsize_t(state);
14473 if (index == -1 && PyErr_Occurred())
14474 return NULL;
14475 if (index < 0)
14476 index = 0;
14477 it->it_index = index;
14478 Py_RETURN_NONE;
14479}
14480
14481PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14482
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014483static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014484 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014485 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014486 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14487 reduce_doc},
14488 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14489 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014490 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014491};
14492
14493PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014494 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14495 "str_iterator", /* tp_name */
14496 sizeof(unicodeiterobject), /* tp_basicsize */
14497 0, /* tp_itemsize */
14498 /* methods */
14499 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14500 0, /* tp_print */
14501 0, /* tp_getattr */
14502 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014503 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014504 0, /* tp_repr */
14505 0, /* tp_as_number */
14506 0, /* tp_as_sequence */
14507 0, /* tp_as_mapping */
14508 0, /* tp_hash */
14509 0, /* tp_call */
14510 0, /* tp_str */
14511 PyObject_GenericGetAttr, /* tp_getattro */
14512 0, /* tp_setattro */
14513 0, /* tp_as_buffer */
14514 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14515 0, /* tp_doc */
14516 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14517 0, /* tp_clear */
14518 0, /* tp_richcompare */
14519 0, /* tp_weaklistoffset */
14520 PyObject_SelfIter, /* tp_iter */
14521 (iternextfunc)unicodeiter_next, /* tp_iternext */
14522 unicodeiter_methods, /* tp_methods */
14523 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014524};
14525
14526static PyObject *
14527unicode_iter(PyObject *seq)
14528{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014529 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014530
Benjamin Peterson14339b62009-01-31 16:36:08 +000014531 if (!PyUnicode_Check(seq)) {
14532 PyErr_BadInternalCall();
14533 return NULL;
14534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014535 if (PyUnicode_READY(seq) == -1)
14536 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014537 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14538 if (it == NULL)
14539 return NULL;
14540 it->it_index = 0;
14541 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014542 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014543 _PyObject_GC_TRACK(it);
14544 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014545}
14546
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014547
14548size_t
14549Py_UNICODE_strlen(const Py_UNICODE *u)
14550{
14551 int res = 0;
14552 while(*u++)
14553 res++;
14554 return res;
14555}
14556
14557Py_UNICODE*
14558Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14559{
14560 Py_UNICODE *u = s1;
14561 while ((*u++ = *s2++));
14562 return s1;
14563}
14564
14565Py_UNICODE*
14566Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14567{
14568 Py_UNICODE *u = s1;
14569 while ((*u++ = *s2++))
14570 if (n-- == 0)
14571 break;
14572 return s1;
14573}
14574
14575Py_UNICODE*
14576Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14577{
14578 Py_UNICODE *u1 = s1;
14579 u1 += Py_UNICODE_strlen(u1);
14580 Py_UNICODE_strcpy(u1, s2);
14581 return s1;
14582}
14583
14584int
14585Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14586{
14587 while (*s1 && *s2 && *s1 == *s2)
14588 s1++, s2++;
14589 if (*s1 && *s2)
14590 return (*s1 < *s2) ? -1 : +1;
14591 if (*s1)
14592 return 1;
14593 if (*s2)
14594 return -1;
14595 return 0;
14596}
14597
14598int
14599Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14600{
14601 register Py_UNICODE u1, u2;
14602 for (; n != 0; n--) {
14603 u1 = *s1;
14604 u2 = *s2;
14605 if (u1 != u2)
14606 return (u1 < u2) ? -1 : +1;
14607 if (u1 == '\0')
14608 return 0;
14609 s1++;
14610 s2++;
14611 }
14612 return 0;
14613}
14614
14615Py_UNICODE*
14616Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14617{
14618 const Py_UNICODE *p;
14619 for (p = s; *p; p++)
14620 if (*p == c)
14621 return (Py_UNICODE*)p;
14622 return NULL;
14623}
14624
14625Py_UNICODE*
14626Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14627{
14628 const Py_UNICODE *p;
14629 p = s + Py_UNICODE_strlen(s);
14630 while (p != s) {
14631 p--;
14632 if (*p == c)
14633 return (Py_UNICODE*)p;
14634 }
14635 return NULL;
14636}
Victor Stinner331ea922010-08-10 16:37:20 +000014637
Victor Stinner71133ff2010-09-01 23:43:53 +000014638Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014639PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014640{
Victor Stinner577db2c2011-10-11 22:12:48 +020014641 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014642 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014644 if (!PyUnicode_Check(unicode)) {
14645 PyErr_BadArgument();
14646 return NULL;
14647 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014648 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014649 if (u == NULL)
14650 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014651 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014652 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014653 PyErr_NoMemory();
14654 return NULL;
14655 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014656 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014657 size *= sizeof(Py_UNICODE);
14658 copy = PyMem_Malloc(size);
14659 if (copy == NULL) {
14660 PyErr_NoMemory();
14661 return NULL;
14662 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014663 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014664 return copy;
14665}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014666
Georg Brandl66c221e2010-10-14 07:04:07 +000014667/* A _string module, to export formatter_parser and formatter_field_name_split
14668 to the string.Formatter class implemented in Python. */
14669
14670static PyMethodDef _string_methods[] = {
14671 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14672 METH_O, PyDoc_STR("split the argument as a field name")},
14673 {"formatter_parser", (PyCFunction) formatter_parser,
14674 METH_O, PyDoc_STR("parse the argument as a format string")},
14675 {NULL, NULL}
14676};
14677
14678static struct PyModuleDef _string_module = {
14679 PyModuleDef_HEAD_INIT,
14680 "_string",
14681 PyDoc_STR("string helper module"),
14682 0,
14683 _string_methods,
14684 NULL,
14685 NULL,
14686 NULL,
14687 NULL
14688};
14689
14690PyMODINIT_FUNC
14691PyInit__string(void)
14692{
14693 return PyModule_Create(&_string_module);
14694}
14695
14696
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014697#ifdef __cplusplus
14698}
14699#endif