blob: bd0dbb47ad033ee17918dd5b32ed4ff225c9b9a1 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
52 The globals are initialized by the _PyUnicode_Init() API and should
53 not be used before calling that API.
54
55*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000057
58#ifdef __cplusplus
59extern "C" {
60#endif
61
/* Highest code point of Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Object check used inside assert() by the macros of this file: debug
   builds run _PyUnicode_CheckConsistency() (without scanning the content),
   other builds fall back to PyUnicode_Check(). */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020070
/* Field accessors for the string object layouts.  The macros made of a bare
   member access can be used as lvalues; the ones that contain assert() are
   read-only expressions. */

/* --- unchecked member access ------------------------------------------- */
#define _PyUnicode_UTF8(op)         (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* --- checked access ----------------------------------------------------- */

/* UTF-8 pointer of a ready string: for a compact ASCII string this is the
   memory located right after the PyASCIIObject header, otherwise the utf8
   member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* UTF-8 length of a ready string: the character length for a compact ASCII
   string, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* state.kind and length, after checking the object. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200105
/* Optimized version of Py_MAX() to compute the maximum character: use it
   when you are computing the second argument of PyUnicode_New().  The two
   values are combined with a bitwise OR instead of being compared, so the
   result is never smaller than either argument but may be larger than both
   (e.g. 0x80 | 0x7f == 0xff). */
#define MAX_MAXCHAR(lhs, rhs) ((lhs) | (rhs))
110
/* File-local replacement for PyUnicode_READY(): after checking the object,
   evaluates to 0 when the string is already ready and to the result of
   _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     (!PyUnicode_IS_READY(op) ?                 \
          _PyUnicode_Ready(op) :                \
          0))
Victor Stinner910337b2011-10-03 03:20:16 +0200117
/* True if the UTF-8 pointer of op is the same pointer as its character data.
   Must not be used on a compact ASCII string (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the same pointer as its character data.
   Only the macro argument is inspected; the macro does not depend on any
   variable of the calling scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
125
/* True if the Unicode object owns a UTF-8 memory block of its own: it is not
   a compact ASCII string, the utf8 pointer is set, and that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op)                           \
      && _PyUnicode_UTF8(op) != NULL                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a wstr memory block of its own: the wstr
   pointer is set and either the string is not ready yet or the pointer is
   not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) != NULL                               \
      && (!PyUnicode_IS_READY(op)                               \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to points to the buffer where the result characters are
   written; the whole argument is converted to "to_type *", so a pointer
   expression such as "buf + offset" is handled as written.

   The first loop converts four characters per iteration, the second one
   handles the remaining characters one at a time.  All helper variables use
   underscore-prefixed names to stay clear of caller identifiers. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _count = (_end) - (_iter);           \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);     \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
174static PyObject *interned;
175
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000176/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200177static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200179/* List of static strings. */
180static _Py_Identifier *static_strings;
181
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000182/* Single character Unicode strings in the Latin-1 range are being
183 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200184static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise.  Each row
   below covers 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
216
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200217/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200218static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100220static int unicode_modifiable(PyObject *unicode);
221
Victor Stinnerfe226c02011-10-03 03:52:20 +0200222
Alexander Belopolsky40018472011-02-26 01:02:56 +0000223static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200224_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
225static PyObject *
226_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
227static PyObject *
228_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
229
230static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000231unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000232 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100233 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000234 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
235
Alexander Belopolsky40018472011-02-26 01:02:56 +0000236static void
237raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300238 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100239 PyObject *unicode,
240 Py_ssize_t startpos, Py_ssize_t endpos,
241 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000242
/* Same kind of table for line breaks: 128 entries indexed by ASCII code,
   1 for a line break character and 0 otherwise.  Each row below covers
   16 code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
270
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300271/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
272 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000273Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000274PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000275{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000276#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000277 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000278#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000279 /* This is actually an illegal character, so it should
280 not be passed to unichr. */
281 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282#endif
283}
284
#ifdef Py_DEBUG
/* Debug-build validator for the internal invariants of a string object.

   op must be a str instance (asserted).  The state flags, the kind, the
   utf8/wstr pointers and their recorded lengths are cross-checked with
   assert().  When check_content is non-zero and the string is not a
   wchar_t-kind string, the characters are scanned as well to verify that
   the maximum character matches the kind in use and that the data ends
   with a null character.

   Always returns 1, so the call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;

        data = PyUnicode_DATA(ascii);
        i = 0;
        while (i < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
            i++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
400
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100401static PyObject*
402unicode_result_wchar(PyObject *unicode)
403{
404#ifndef Py_DEBUG
405 Py_ssize_t len;
406
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100407 len = _PyUnicode_WSTR_LENGTH(unicode);
408 if (len == 0) {
409 Py_INCREF(unicode_empty);
410 Py_DECREF(unicode);
411 return unicode_empty;
412 }
413
414 if (len == 1) {
415 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
416 if (ch < 256) {
417 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
418 Py_DECREF(unicode);
419 return latin1_char;
420 }
421 }
422
423 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200424 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425 return NULL;
426 }
427#else
Victor Stinneraa771272012-10-04 02:32:58 +0200428 assert(Py_REFCNT(unicode) == 1);
429
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100430 /* don't make the result ready in debug mode to ensure that the caller
431 makes the string ready before using it */
432 assert(_PyUnicode_CheckConsistency(unicode, 1));
433#endif
434 return unicode;
435}
436
437static PyObject*
438unicode_result_ready(PyObject *unicode)
439{
440 Py_ssize_t length;
441
442 length = PyUnicode_GET_LENGTH(unicode);
443 if (length == 0) {
444 if (unicode != unicode_empty) {
445 Py_INCREF(unicode_empty);
446 Py_DECREF(unicode);
447 }
448 return unicode_empty;
449 }
450
451 if (length == 1) {
452 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
453 if (ch < 256) {
454 PyObject *latin1_char = unicode_latin1[ch];
455 if (latin1_char != NULL) {
456 if (unicode != latin1_char) {
457 Py_INCREF(latin1_char);
458 Py_DECREF(unicode);
459 }
460 return latin1_char;
461 }
462 else {
463 assert(_PyUnicode_CheckConsistency(unicode, 1));
464 Py_INCREF(unicode);
465 unicode_latin1[ch] = unicode;
466 return unicode;
467 }
468 }
469 }
470
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 return unicode;
473}
474
475static PyObject*
476unicode_result(PyObject *unicode)
477{
478 assert(_PyUnicode_CHECK(unicode));
479 if (PyUnicode_IS_READY(unicode))
480 return unicode_result_ready(unicode);
481 else
482 return unicode_result_wchar(unicode);
483}
484
Victor Stinnerc4b49542011-12-11 22:44:26 +0100485static PyObject*
486unicode_result_unchanged(PyObject *unicode)
487{
488 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500489 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490 return NULL;
491 Py_INCREF(unicode);
492 return unicode;
493 }
494 else
495 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100496 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100497}
498
Victor Stinner3a50e702011-10-18 21:21:00 +0200499#ifdef HAVE_MBCS
500static OSVERSIONINFOEX winver;
501#endif
502
Thomas Wouters477c8d52006-05-27 19:21:47 +0000503/* --- Bloom Filters ----------------------------------------------------- */
504
505/* stuff to implement simple "bloom filters" for Unicode characters.
506 to keep things simple, we use a single bitmask, using the least 5
507 bits from each unicode characters as the bit index. */
508
509/* the linebreak mask is set up by Unicode_Init below */
510
Antoine Pitrouf068f942010-01-13 14:19:12 +0000511#if LONG_BIT >= 128
512#define BLOOM_WIDTH 128
513#elif LONG_BIT >= 64
514#define BLOOM_WIDTH 64
515#elif LONG_BIT >= 32
516#define BLOOM_WIDTH 32
517#else
518#error "LONG_BIT is smaller than 32"
519#endif
520
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521#define BLOOM_MASK unsigned long
522
523static BLOOM_MASK bloom_linebreak;
524
Antoine Pitrouf068f942010-01-13 14:19:12 +0000525#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
526#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000527
Benjamin Peterson29060642009-01-31 22:14:21 +0000528#define BLOOM_LINEBREAK(ch) \
529 ((ch) < 128U ? ascii_linebreak[(ch)] : \
530 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000531
Alexander Belopolsky40018472011-02-26 01:02:56 +0000532Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000534{
535 /* calculate simple bloom-style bitmask for a given unicode string */
536
Antoine Pitrouf068f942010-01-13 14:19:12 +0000537 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000538 Py_ssize_t i;
539
540 mask = 0;
541 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543
544 return mask;
545}
546
/* True if chr occurs in str: the bloom mask is tested first, and
   PyUnicode_FindChar() is only called when the bit of chr is set. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000550
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200551/* Compilation of templated routines */
552
553#include "stringlib/asciilib.h"
554#include "stringlib/fastsearch.h"
555#include "stringlib/partition.h"
556#include "stringlib/split.h"
557#include "stringlib/count.h"
558#include "stringlib/find.h"
559#include "stringlib/find_max_char.h"
560#include "stringlib/localeutil.h"
561#include "stringlib/undef.h"
562
563#include "stringlib/ucs1lib.h"
564#include "stringlib/fastsearch.h"
565#include "stringlib/partition.h"
566#include "stringlib/split.h"
567#include "stringlib/count.h"
568#include "stringlib/find.h"
569#include "stringlib/find_max_char.h"
570#include "stringlib/localeutil.h"
571#include "stringlib/undef.h"
572
573#include "stringlib/ucs2lib.h"
574#include "stringlib/fastsearch.h"
575#include "stringlib/partition.h"
576#include "stringlib/split.h"
577#include "stringlib/count.h"
578#include "stringlib/find.h"
579#include "stringlib/find_max_char.h"
580#include "stringlib/localeutil.h"
581#include "stringlib/undef.h"
582
583#include "stringlib/ucs4lib.h"
584#include "stringlib/fastsearch.h"
585#include "stringlib/partition.h"
586#include "stringlib/split.h"
587#include "stringlib/count.h"
588#include "stringlib/find.h"
589#include "stringlib/find_max_char.h"
590#include "stringlib/localeutil.h"
591#include "stringlib/undef.h"
592
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200593#include "stringlib/unicodedefs.h"
594#include "stringlib/fastsearch.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100597#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599/* --- Unicode Object ----------------------------------------------------- */
600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200601static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200602fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200603
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200604Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
605 Py_ssize_t size, Py_UCS4 ch,
606 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200608 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
609
610 switch (kind) {
611 case PyUnicode_1BYTE_KIND:
612 {
613 Py_UCS1 ch1 = (Py_UCS1) ch;
614 if (ch1 == ch)
615 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
616 else
617 return -1;
618 }
619 case PyUnicode_2BYTE_KIND:
620 {
621 Py_UCS2 ch2 = (Py_UCS2) ch;
622 if (ch2 == ch)
623 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_4BYTE_KIND:
628 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
629 default:
630 assert(0);
631 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200633}
634
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier: every byte of the characters located between old_length and the
   current length is set to 0xff.  Nothing is done if the string did not
   grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *start = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(start + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
653
Victor Stinnerfe226c02011-10-03 03:52:20 +0200654static PyObject*
655resize_compact(PyObject *unicode, Py_ssize_t length)
656{
657 Py_ssize_t char_size;
658 Py_ssize_t struct_size;
659 Py_ssize_t new_size;
660 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100661 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200662#ifdef Py_DEBUG
663 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
664#endif
665
Victor Stinner79891572012-05-03 13:43:07 +0200666 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200667 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100668 assert(PyUnicode_IS_COMPACT(unicode));
669
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200670 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100671 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200672 struct_size = sizeof(PyASCIIObject);
673 else
674 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200675 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676
Victor Stinnerfe226c02011-10-03 03:52:20 +0200677 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
678 PyErr_NoMemory();
679 return NULL;
680 }
681 new_size = (struct_size + (length + 1) * char_size);
682
Victor Stinner84def372011-12-11 20:04:56 +0100683 _Py_DEC_REFTOTAL;
684 _Py_ForgetReference(unicode);
685
686 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
687 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100688 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200689 PyErr_NoMemory();
690 return NULL;
691 }
Victor Stinner84def372011-12-11 20:04:56 +0100692 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200696 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100698 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200699 _PyUnicode_WSTR_LENGTH(unicode) = length;
700 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200701#ifdef Py_DEBUG
702 unicode_fill_invalid(unicode, old_length);
703#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
705 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200706 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200707 return unicode;
708}
709
Alexander Belopolsky40018472011-02-26 01:02:56 +0000710static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200711resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712{
Victor Stinner95663112011-10-04 01:03:50 +0200713 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100714 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 if (PyUnicode_IS_READY(unicode)) {
719 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200720 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200722#ifdef Py_DEBUG
723 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
724#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725
726 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200727 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200728 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
729 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730
731 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
732 PyErr_NoMemory();
733 return -1;
734 }
735 new_size = (length + 1) * char_size;
736
Victor Stinner7a9105a2011-12-12 00:13:42 +0100737 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
738 {
739 PyObject_DEL(_PyUnicode_UTF8(unicode));
740 _PyUnicode_UTF8(unicode) = NULL;
741 _PyUnicode_UTF8_LENGTH(unicode) = 0;
742 }
743
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744 data = (PyObject *)PyObject_REALLOC(data, new_size);
745 if (data == NULL) {
746 PyErr_NoMemory();
747 return -1;
748 }
749 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200750 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200751 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200752 _PyUnicode_WSTR_LENGTH(unicode) = length;
753 }
754 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200755 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200756 _PyUnicode_UTF8_LENGTH(unicode) = length;
757 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200758 _PyUnicode_LENGTH(unicode) = length;
759 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200760#ifdef Py_DEBUG
761 unicode_fill_invalid(unicode, old_length);
762#endif
Victor Stinner95663112011-10-04 01:03:50 +0200763 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200764 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 }
Victor Stinner95663112011-10-04 01:03:50 +0200768 assert(_PyUnicode_WSTR(unicode) != NULL);
769
770 /* check for integer overflow */
771 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
772 PyErr_NoMemory();
773 return -1;
774 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100775 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200776 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100777 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200778 if (!wstr) {
779 PyErr_NoMemory();
780 return -1;
781 }
782 _PyUnicode_WSTR(unicode) = wstr;
783 _PyUnicode_WSTR(unicode)[length] = 0;
784 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200785 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000786 return 0;
787}
788
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789static PyObject*
790resize_copy(PyObject *unicode, Py_ssize_t length)
791{
792 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100793 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100795
Benjamin Petersonbac79492012-01-14 13:34:47 -0500796 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100797 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200798
799 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
800 if (copy == NULL)
801 return NULL;
802
803 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200804 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200805 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200806 }
807 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200808 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100809
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200810 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 if (w == NULL)
812 return NULL;
813 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
814 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200815 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
816 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200817 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200818 }
819}
820
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000822 Ux0000 terminated; some code (e.g. new_identifier)
823 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824
825 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000826 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000827
828*/
829
Alexander Belopolsky40018472011-02-26 01:02:56 +0000830static PyUnicodeObject *
831_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832{
833 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200834 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835
Thomas Wouters477c8d52006-05-27 19:21:47 +0000836 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000837 if (length == 0 && unicode_empty != NULL) {
838 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200839 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840 }
841
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000842 /* Ensure we won't overflow the size. */
843 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
844 return (PyUnicodeObject *)PyErr_NoMemory();
845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 if (length < 0) {
847 PyErr_SetString(PyExc_SystemError,
848 "Negative size passed to _PyUnicode_New");
849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000850 }
851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
853 if (unicode == NULL)
854 return NULL;
855 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
856 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
857 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100858 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000859 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100860 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200862
Jeremy Hyltond8082792003-09-16 19:41:39 +0000863 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000864 * the caller fails before initializing str -- unicode_resize()
865 * reads str[0], and the Keep-Alive optimization can keep memory
866 * allocated for str alive across a call to unicode_dealloc(unicode).
867 * We don't want unicode_resize to read uninitialized memory in
868 * that case.
869 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870 _PyUnicode_WSTR(unicode)[0] = 0;
871 _PyUnicode_WSTR(unicode)[length] = 0;
872 _PyUnicode_WSTR_LENGTH(unicode) = length;
873 _PyUnicode_HASH(unicode) = -1;
874 _PyUnicode_STATE(unicode).interned = 0;
875 _PyUnicode_STATE(unicode).kind = 0;
876 _PyUnicode_STATE(unicode).compact = 0;
877 _PyUnicode_STATE(unicode).ready = 0;
878 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200879 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200881 _PyUnicode_UTF8(unicode) = NULL;
882 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100883 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 return unicode;
885}
886
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887static const char*
888unicode_kind_name(PyObject *unicode)
889{
Victor Stinner42dfd712011-10-03 14:41:45 +0200890 /* don't check consistency: unicode_kind_name() is called from
891 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 if (!PyUnicode_IS_COMPACT(unicode))
893 {
894 if (!PyUnicode_IS_READY(unicode))
895 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600896 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 {
898 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200899 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 return "legacy ascii";
901 else
902 return "legacy latin1";
903 case PyUnicode_2BYTE_KIND:
904 return "legacy UCS2";
905 case PyUnicode_4BYTE_KIND:
906 return "legacy UCS4";
907 default:
908 return "<legacy invalid kind>";
909 }
910 }
911 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600912 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200913 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200914 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200915 return "ascii";
916 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200917 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200918 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200919 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200920 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200921 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200922 default:
923 return "<invalid compact kind>";
924 }
925}
926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
932
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
936void *_PyUnicode_data(void *unicode){
937 printf("obj %p\n", unicode);
938 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
939 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
940 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
941 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
942 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
943 return PyUnicode_DATA(unicode);
944}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200945
946void
947_PyUnicode_Dump(PyObject *op)
948{
949 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200950 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
951 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
952 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200953
Victor Stinnera849a4b2011-10-03 12:12:11 +0200954 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200955 {
956 if (ascii->state.ascii)
957 data = (ascii + 1);
958 else
959 data = (compact + 1);
960 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200961 else
962 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200963 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
964
Victor Stinnera849a4b2011-10-03 12:12:11 +0200965 if (ascii->wstr == data)
966 printf("shared ");
967 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200968
Victor Stinnera3b334d2011-10-03 13:53:37 +0200969 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200970 printf(" (%zu), ", compact->wstr_length);
971 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
972 printf("shared ");
973 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200974 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200975 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200976}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977#endif
978
979PyObject *
980PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
981{
982 PyObject *obj;
983 PyCompactUnicodeObject *unicode;
984 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200985 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200986 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 Py_ssize_t char_size;
988 Py_ssize_t struct_size;
989
990 /* Optimization for empty strings */
991 if (size == 0 && unicode_empty != NULL) {
992 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200993 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200994 }
995
Victor Stinner9e9d6892011-10-04 01:02:02 +0200996 is_ascii = 0;
997 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200998 struct_size = sizeof(PyCompactUnicodeObject);
999 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001000 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001 char_size = 1;
1002 is_ascii = 1;
1003 struct_size = sizeof(PyASCIIObject);
1004 }
1005 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001006 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001007 char_size = 1;
1008 }
1009 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001010 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001011 char_size = 2;
1012 if (sizeof(wchar_t) == 2)
1013 is_sharing = 1;
1014 }
1015 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001016 if (maxchar > MAX_UNICODE) {
1017 PyErr_SetString(PyExc_SystemError,
1018 "invalid maximum character passed to PyUnicode_New");
1019 return NULL;
1020 }
Victor Stinner8f825062012-04-27 13:55:39 +02001021 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001022 char_size = 4;
1023 if (sizeof(wchar_t) == 4)
1024 is_sharing = 1;
1025 }
1026
1027 /* Ensure we won't overflow the size. */
1028 if (size < 0) {
1029 PyErr_SetString(PyExc_SystemError,
1030 "Negative size passed to PyUnicode_New");
1031 return NULL;
1032 }
1033 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1034 return PyErr_NoMemory();
1035
1036 /* Duplicated allocation code from _PyObject_New() instead of a call to
1037 * PyObject_New() so we are able to allocate space for the object and
1038 * it's data buffer.
1039 */
1040 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1041 if (obj == NULL)
1042 return PyErr_NoMemory();
1043 obj = PyObject_INIT(obj, &PyUnicode_Type);
1044 if (obj == NULL)
1045 return NULL;
1046
1047 unicode = (PyCompactUnicodeObject *)obj;
1048 if (is_ascii)
1049 data = ((PyASCIIObject*)obj) + 1;
1050 else
1051 data = unicode + 1;
1052 _PyUnicode_LENGTH(unicode) = size;
1053 _PyUnicode_HASH(unicode) = -1;
1054 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001055 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 _PyUnicode_STATE(unicode).compact = 1;
1057 _PyUnicode_STATE(unicode).ready = 1;
1058 _PyUnicode_STATE(unicode).ascii = is_ascii;
1059 if (is_ascii) {
1060 ((char*)data)[size] = 0;
1061 _PyUnicode_WSTR(unicode) = NULL;
1062 }
Victor Stinner8f825062012-04-27 13:55:39 +02001063 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 ((char*)data)[size] = 0;
1065 _PyUnicode_WSTR(unicode) = NULL;
1066 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001068 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 else {
1071 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001072 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001073 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001075 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001076 ((Py_UCS4*)data)[size] = 0;
1077 if (is_sharing) {
1078 _PyUnicode_WSTR_LENGTH(unicode) = size;
1079 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1080 }
1081 else {
1082 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1083 _PyUnicode_WSTR(unicode) = NULL;
1084 }
1085 }
Victor Stinner8f825062012-04-27 13:55:39 +02001086#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001087 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001088#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001089 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 return obj;
1091}
1092
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4: a
   high surrogate immediately followed by a low surrogate is decoded into a
   single code point, every other unit is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if ((src + 1) < end
            && Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* well-formed surrogate pair: two units, one code point */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1132
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133static int
Victor Stinner488fa492011-12-12 00:01:39 +01001134unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001135{
Victor Stinner488fa492011-12-12 00:01:39 +01001136 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001137 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001138 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return -1;
1140 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141 return 0;
1142}
1143
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144static int
1145_copy_characters(PyObject *to, Py_ssize_t to_start,
1146 PyObject *from, Py_ssize_t from_start,
1147 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 unsigned int from_kind, to_kind;
1150 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151
Victor Stinneree4544c2012-05-09 22:24:08 +02001152 assert(0 <= how_many);
1153 assert(0 <= from_start);
1154 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001155 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001157 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001158
Victor Stinnerd3f08822012-05-29 12:57:52 +02001159 assert(PyUnicode_Check(to));
1160 assert(PyUnicode_IS_READY(to));
1161 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1162
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001163 if (how_many == 0)
1164 return 0;
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001167 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170
Victor Stinnerf1852262012-06-16 16:38:26 +02001171#ifdef Py_DEBUG
1172 if (!check_maxchar
1173 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1174 {
1175 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1176 Py_UCS4 ch;
1177 Py_ssize_t i;
1178 for (i=0; i < how_many; i++) {
1179 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1180 assert(ch <= to_maxchar);
1181 }
1182 }
1183#endif
1184
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001185 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001186 if (check_maxchar
1187 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1188 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001189 /* Writing Latin-1 characters into an ASCII string requires to
1190 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001191 Py_UCS4 max_char;
1192 max_char = ucs1lib_find_max_char(from_data,
1193 (Py_UCS1*)from_data + how_many);
1194 if (max_char >= 128)
1195 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001196 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001197 Py_MEMCPY((char*)to_data + to_kind * to_start,
1198 (char*)from_data + from_kind * from_start,
1199 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001201 else if (from_kind == PyUnicode_1BYTE_KIND
1202 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 {
1204 _PyUnicode_CONVERT_BYTES(
1205 Py_UCS1, Py_UCS2,
1206 PyUnicode_1BYTE_DATA(from) + from_start,
1207 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1208 PyUnicode_2BYTE_DATA(to) + to_start
1209 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001210 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001211 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001212 && to_kind == PyUnicode_4BYTE_KIND)
1213 {
1214 _PyUnicode_CONVERT_BYTES(
1215 Py_UCS1, Py_UCS4,
1216 PyUnicode_1BYTE_DATA(from) + from_start,
1217 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1218 PyUnicode_4BYTE_DATA(to) + to_start
1219 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001220 }
1221 else if (from_kind == PyUnicode_2BYTE_KIND
1222 && to_kind == PyUnicode_4BYTE_KIND)
1223 {
1224 _PyUnicode_CONVERT_BYTES(
1225 Py_UCS2, Py_UCS4,
1226 PyUnicode_2BYTE_DATA(from) + from_start,
1227 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1228 PyUnicode_4BYTE_DATA(to) + to_start
1229 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001230 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001231 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001232 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1233
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001234 if (!check_maxchar) {
1235 if (from_kind == PyUnicode_2BYTE_KIND
1236 && to_kind == PyUnicode_1BYTE_KIND)
1237 {
1238 _PyUnicode_CONVERT_BYTES(
1239 Py_UCS2, Py_UCS1,
1240 PyUnicode_2BYTE_DATA(from) + from_start,
1241 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1242 PyUnicode_1BYTE_DATA(to) + to_start
1243 );
1244 }
1245 else if (from_kind == PyUnicode_4BYTE_KIND
1246 && to_kind == PyUnicode_1BYTE_KIND)
1247 {
1248 _PyUnicode_CONVERT_BYTES(
1249 Py_UCS4, Py_UCS1,
1250 PyUnicode_4BYTE_DATA(from) + from_start,
1251 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1252 PyUnicode_1BYTE_DATA(to) + to_start
1253 );
1254 }
1255 else if (from_kind == PyUnicode_4BYTE_KIND
1256 && to_kind == PyUnicode_2BYTE_KIND)
1257 {
1258 _PyUnicode_CONVERT_BYTES(
1259 Py_UCS4, Py_UCS2,
1260 PyUnicode_4BYTE_DATA(from) + from_start,
1261 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1262 PyUnicode_2BYTE_DATA(to) + to_start
1263 );
1264 }
1265 else {
1266 assert(0);
1267 return -1;
1268 }
1269 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001270 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001271 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001272 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001273 Py_ssize_t i;
1274
Victor Stinnera0702ab2011-09-29 14:14:38 +02001275 for (i=0; i < how_many; i++) {
1276 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001277 if (ch > to_maxchar)
1278 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001279 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1280 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001281 }
1282 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001283 return 0;
1284}
1285
Victor Stinnerd3f08822012-05-29 12:57:52 +02001286void
1287_PyUnicode_FastCopyCharacters(
1288 PyObject *to, Py_ssize_t to_start,
1289 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001290{
1291 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1292}
1293
1294Py_ssize_t
1295PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1296 PyObject *from, Py_ssize_t from_start,
1297 Py_ssize_t how_many)
1298{
1299 int err;
1300
1301 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1302 PyErr_BadInternalCall();
1303 return -1;
1304 }
1305
Benjamin Petersonbac79492012-01-14 13:34:47 -05001306 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001307 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001308 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001309 return -1;
1310
Victor Stinnerd3f08822012-05-29 12:57:52 +02001311 if (from_start < 0) {
1312 PyErr_SetString(PyExc_IndexError, "string index out of range");
1313 return -1;
1314 }
1315 if (to_start < 0) {
1316 PyErr_SetString(PyExc_IndexError, "string index out of range");
1317 return -1;
1318 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001319 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1320 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1321 PyErr_Format(PyExc_SystemError,
1322 "Cannot write %zi characters at %zi "
1323 "in a string of %zi characters",
1324 how_many, to_start, PyUnicode_GET_LENGTH(to));
1325 return -1;
1326 }
1327
1328 if (how_many == 0)
1329 return 0;
1330
Victor Stinner488fa492011-12-12 00:01:39 +01001331 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001332 return -1;
1333
1334 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1335 if (err) {
1336 PyErr_Format(PyExc_SystemError,
1337 "Cannot copy %s characters "
1338 "into a string of %s characters",
1339 unicode_kind_name(from),
1340 unicode_kind_name(to));
1341 return -1;
1342 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001343 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344}
1345
Victor Stinner17222162011-09-28 22:15:37 +02001346/* Find the maximum code point and count the number of surrogate pairs so a
1347 correct string length can be computed before converting a string to UCS4.
1348 This function counts single surrogates as a character and not as a pair.
1349
1350 Return 0 on success, or -1 on error. */
1351static int
1352find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1353 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001354{
1355 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001356 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357
Victor Stinnerc53be962011-10-02 21:33:54 +02001358 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001359 *num_surrogates = 0;
1360 *maxchar = 0;
1361
1362 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001364 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1365 && (iter+1) < end
1366 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001368 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 iter += 2;
1371 }
1372 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001374 {
1375 ch = *iter;
1376 iter++;
1377 }
1378 if (ch > *maxchar) {
1379 *maxchar = ch;
1380 if (*maxchar > MAX_UNICODE) {
1381 PyErr_Format(PyExc_ValueError,
1382 "character U+%x is not in range [U+0000; U+10ffff]",
1383 ch);
1384 return -1;
1385 }
1386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 }
1388 return 0;
1389}
1390
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001391int
1392_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393{
1394 wchar_t *end;
1395 Py_UCS4 maxchar = 0;
1396 Py_ssize_t num_surrogates;
1397#if SIZEOF_WCHAR_T == 2
1398 Py_ssize_t length_wo_surrogates;
1399#endif
1400
Georg Brandl7597add2011-10-05 16:36:47 +02001401 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001402 strings were created using _PyObject_New() and where no canonical
1403 representation (the str field) has been set yet aka strings
1404 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001405 assert(_PyUnicode_CHECK(unicode));
1406 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001408 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001409 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001410 /* Actually, it should neither be interned nor be anything else: */
1411 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001414 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001415 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417
1418 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001419 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1420 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 PyErr_NoMemory();
1422 return -1;
1423 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001424 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 _PyUnicode_WSTR(unicode), end,
1426 PyUnicode_1BYTE_DATA(unicode));
1427 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1428 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1429 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1430 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001431 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001432 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001433 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 }
1435 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001436 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001437 _PyUnicode_UTF8(unicode) = NULL;
1438 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 }
1440 PyObject_FREE(_PyUnicode_WSTR(unicode));
1441 _PyUnicode_WSTR(unicode) = NULL;
1442 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1443 }
1444 /* In this case we might have to convert down from 4-byte native
1445 wchar_t to 2-byte unicode. */
1446 else if (maxchar < 65536) {
1447 assert(num_surrogates == 0 &&
1448 "FindMaxCharAndNumSurrogatePairs() messed up");
1449
Victor Stinner506f5922011-09-28 22:34:18 +02001450#if SIZEOF_WCHAR_T == 2
1451 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001452 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001453 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1454 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1455 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001456 _PyUnicode_UTF8(unicode) = NULL;
1457 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001458#else
1459 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001460 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001461 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001462 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001463 PyErr_NoMemory();
1464 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 }
Victor Stinner506f5922011-09-28 22:34:18 +02001466 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1467 _PyUnicode_WSTR(unicode), end,
1468 PyUnicode_2BYTE_DATA(unicode));
1469 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1470 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1471 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001472 _PyUnicode_UTF8(unicode) = NULL;
1473 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001474 PyObject_FREE(_PyUnicode_WSTR(unicode));
1475 _PyUnicode_WSTR(unicode) = NULL;
1476 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1477#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 }
1479 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1480 else {
1481#if SIZEOF_WCHAR_T == 2
1482 /* in case the native representation is 2-bytes, we need to allocate a
1483 new normalized 4-byte version. */
1484 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001485 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1486 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 PyErr_NoMemory();
1488 return -1;
1489 }
1490 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1491 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001492 _PyUnicode_UTF8(unicode) = NULL;
1493 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001494 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1495 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001496 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 PyObject_FREE(_PyUnicode_WSTR(unicode));
1498 _PyUnicode_WSTR(unicode) = NULL;
1499 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1500#else
1501 assert(num_surrogates == 0);
1502
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001505 _PyUnicode_UTF8(unicode) = NULL;
1506 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1508#endif
1509 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1510 }
1511 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001512 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001513 return 0;
1514}
1515
Alexander Belopolsky40018472011-02-26 01:02:56 +00001516static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001517unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518{
Walter Dörwald16807132007-05-25 13:52:07 +00001519 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001520 case SSTATE_NOT_INTERNED:
1521 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001522
Benjamin Peterson29060642009-01-31 22:14:21 +00001523 case SSTATE_INTERNED_MORTAL:
1524 /* revive dead object temporarily for DelItem */
1525 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001526 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001527 Py_FatalError(
1528 "deletion of interned string failed");
1529 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 case SSTATE_INTERNED_IMMORTAL:
1532 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001533
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 default:
1535 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001536 }
1537
Victor Stinner03490912011-10-03 23:45:12 +02001538 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001539 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001540 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001541 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001542 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1543 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001545 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001546}
1547
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character latin1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only ready strings of length 1 can live in the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return ch < 256 && unicode_latin1[ch] == unicode;
}
#endif
1564
Alexander Belopolsky40018472011-02-26 01:02:56 +00001565static int
Victor Stinner488fa492011-12-12 00:01:39 +01001566unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001567{
Victor Stinner488fa492011-12-12 00:01:39 +01001568 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001569 if (Py_REFCNT(unicode) != 1)
1570 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (_PyUnicode_HASH(unicode) != -1)
1572 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001573 if (PyUnicode_CHECK_INTERNED(unicode))
1574 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001575 if (!PyUnicode_CheckExact(unicode))
1576 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001577#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001578 /* singleton refcount is greater than 1 */
1579 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001580#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 return 1;
1582}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584static int
1585unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1586{
1587 PyObject *unicode;
1588 Py_ssize_t old_length;
1589
1590 assert(p_unicode != NULL);
1591 unicode = *p_unicode;
1592
1593 assert(unicode != NULL);
1594 assert(PyUnicode_Check(unicode));
1595 assert(0 <= length);
1596
Victor Stinner910337b2011-10-03 03:20:16 +02001597 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001598 old_length = PyUnicode_WSTR_LENGTH(unicode);
1599 else
1600 old_length = PyUnicode_GET_LENGTH(unicode);
1601 if (old_length == length)
1602 return 0;
1603
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001604 if (length == 0) {
1605 Py_DECREF(*p_unicode);
1606 *p_unicode = unicode_empty;
1607 Py_INCREF(*p_unicode);
1608 return 0;
1609 }
1610
Victor Stinner488fa492011-12-12 00:01:39 +01001611 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001612 PyObject *copy = resize_copy(unicode, length);
1613 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001614 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 Py_DECREF(*p_unicode);
1616 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618 }
1619
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001621 PyObject *new_unicode = resize_compact(unicode, length);
1622 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001624 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001625 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001626 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001627 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001628}
1629
Alexander Belopolsky40018472011-02-26 01:02:56 +00001630int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 PyObject *unicode;
1634 if (p_unicode == NULL) {
1635 PyErr_BadInternalCall();
1636 return -1;
1637 }
1638 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001639 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001640 {
1641 PyErr_BadInternalCall();
1642 return -1;
1643 }
1644 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001645}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001646
Victor Stinnerc5166102012-02-22 13:55:02 +01001647/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001648
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001649 WARNING: The function doesn't copy the terminating null character and
1650 doesn't check the maximum character (may write a latin1 character in an
1651 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001652static void
1653unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1654 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001655{
1656 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1657 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001658 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001659
1660 switch (kind) {
1661 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001662 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001663#ifdef Py_DEBUG
1664 if (PyUnicode_IS_ASCII(unicode)) {
1665 Py_UCS4 maxchar = ucs1lib_find_max_char(
1666 (const Py_UCS1*)str,
1667 (const Py_UCS1*)str + len);
1668 assert(maxchar < 128);
1669 }
1670#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001671 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001672 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001673 }
1674 case PyUnicode_2BYTE_KIND: {
1675 Py_UCS2 *start = (Py_UCS2 *)data + index;
1676 Py_UCS2 *ucs2 = start;
1677 assert(index <= PyUnicode_GET_LENGTH(unicode));
1678
Victor Stinner184252a2012-06-16 02:57:41 +02001679 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001680 *ucs2 = (Py_UCS2)*str;
1681
1682 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001683 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001684 }
1685 default: {
1686 Py_UCS4 *start = (Py_UCS4 *)data + index;
1687 Py_UCS4 *ucs4 = start;
1688 assert(kind == PyUnicode_4BYTE_KIND);
1689 assert(index <= PyUnicode_GET_LENGTH(unicode));
1690
Victor Stinner184252a2012-06-16 02:57:41 +02001691 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001692 *ucs4 = (Py_UCS4)*str;
1693
1694 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001695 }
1696 }
1697}
1698
1699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700static PyObject*
1701get_latin1_char(unsigned char ch)
1702{
Victor Stinnera464fc12011-10-02 20:39:30 +02001703 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001705 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 if (!unicode)
1707 return NULL;
1708 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001709 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 unicode_latin1[ch] = unicode;
1711 }
1712 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001713 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714}
1715
Alexander Belopolsky40018472011-02-26 01:02:56 +00001716PyObject *
1717PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001719 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720 Py_UCS4 maxchar = 0;
1721 Py_ssize_t num_surrogates;
1722
1723 if (u == NULL)
1724 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001726 /* If the Unicode data is known at construction time, we can apply
1727 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 /* Optimization for empty strings */
1730 if (size == 0 && unicode_empty != NULL) {
1731 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001732 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001733 }
Tim Petersced69f82003-09-16 20:30:58 +00001734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 /* Single character Unicode objects in the Latin-1 range are
1736 shared when using this constructor */
1737 if (size == 1 && *u < 256)
1738 return get_latin1_char((unsigned char)*u);
1739
1740 /* If not empty and not single character, copy the Unicode data
1741 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001742 if (find_maxchar_surrogates(u, u + size,
1743 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 return NULL;
1745
Victor Stinner8faf8212011-12-08 22:14:11 +01001746 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 if (!unicode)
1748 return NULL;
1749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 switch (PyUnicode_KIND(unicode)) {
1751 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001752 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1754 break;
1755 case PyUnicode_2BYTE_KIND:
1756#if Py_UNICODE_SIZE == 2
1757 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1758#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001759 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1761#endif
1762 break;
1763 case PyUnicode_4BYTE_KIND:
1764#if SIZEOF_WCHAR_T == 2
1765 /* This is the only case which has to process surrogates, thus
1766 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001767 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768#else
1769 assert(num_surrogates == 0);
1770 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1771#endif
1772 break;
1773 default:
1774 assert(0 && "Impossible state");
1775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001777 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778}
1779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780PyObject *
1781PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001782{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001783 if (size < 0) {
1784 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001786 return NULL;
1787 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001788 if (u != NULL)
1789 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1790 else
1791 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001792}
1793
Alexander Belopolsky40018472011-02-26 01:02:56 +00001794PyObject *
1795PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001796{
1797 size_t size = strlen(u);
1798 if (size > PY_SSIZE_T_MAX) {
1799 PyErr_SetString(PyExc_OverflowError, "input too long");
1800 return NULL;
1801 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001802 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001803}
1804
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001805PyObject *
1806_PyUnicode_FromId(_Py_Identifier *id)
1807{
1808 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001809 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1810 strlen(id->string),
1811 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001812 if (!id->object)
1813 return NULL;
1814 PyUnicode_InternInPlace(&id->object);
1815 assert(!id->next);
1816 id->next = static_strings;
1817 static_strings = id;
1818 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001819 return id->object;
1820}
1821
1822void
1823_PyUnicode_ClearStaticStrings()
1824{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001825 _Py_Identifier *tmp, *s = static_strings;
1826 while (s) {
1827 Py_DECREF(s->object);
1828 s->object = NULL;
1829 tmp = s->next;
1830 s->next = NULL;
1831 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001832 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001833 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001834}
1835
Benjamin Peterson0df54292012-03-26 14:50:32 -04001836/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001837
Victor Stinnerd3f08822012-05-29 12:57:52 +02001838PyObject*
1839_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001840{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001841 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001842 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001843 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001844#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001845 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001846#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001847 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001848 }
Victor Stinner785938e2011-12-11 20:09:03 +01001849 unicode = PyUnicode_New(size, 127);
1850 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001851 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001852 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1853 assert(_PyUnicode_CheckConsistency(unicode, 1));
1854 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001855}
1856
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001857static Py_UCS4
1858kind_maxchar_limit(unsigned int kind)
1859{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001860 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001861 case PyUnicode_1BYTE_KIND:
1862 return 0x80;
1863 case PyUnicode_2BYTE_KIND:
1864 return 0x100;
1865 case PyUnicode_4BYTE_KIND:
1866 return 0x10000;
1867 default:
1868 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001869 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001870 }
1871}
1872
Victor Stinnere6abb482012-05-02 01:15:40 +02001873Py_LOCAL_INLINE(Py_UCS4)
1874align_maxchar(Py_UCS4 maxchar)
1875{
1876 if (maxchar <= 127)
1877 return 127;
1878 else if (maxchar <= 255)
1879 return 255;
1880 else if (maxchar <= 65535)
1881 return 65535;
1882 else
1883 return MAX_UNICODE;
1884}
1885
Victor Stinner702c7342011-10-05 13:50:52 +02001886static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001887_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001888{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001889 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001890 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001891
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001892 if (size == 0) {
1893 Py_INCREF(unicode_empty);
1894 return unicode_empty;
1895 }
1896 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001897 if (size == 1)
1898 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001899
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001900 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001901 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 if (!res)
1903 return NULL;
1904 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001905 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001907}
1908
Victor Stinnere57b1c02011-09-28 22:20:48 +02001909static PyObject*
1910_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911{
1912 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001913 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001914
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001915 if (size == 0) {
1916 Py_INCREF(unicode_empty);
1917 return unicode_empty;
1918 }
1919 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001920 if (size == 1) {
1921 Py_UCS4 ch = u[0];
1922 if (ch < 256)
1923 return get_latin1_char((unsigned char)ch);
1924
1925 res = PyUnicode_New(1, ch);
1926 if (res == NULL)
1927 return NULL;
1928 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1929 assert(_PyUnicode_CheckConsistency(res, 1));
1930 return res;
1931 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001932
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001933 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001934 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 if (!res)
1936 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001937 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001938 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001939 else {
1940 _PyUnicode_CONVERT_BYTES(
1941 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1942 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001943 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001944 return res;
1945}
1946
Victor Stinnere57b1c02011-09-28 22:20:48 +02001947static PyObject*
1948_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949{
1950 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001951 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001952
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001953 if (size == 0) {
1954 Py_INCREF(unicode_empty);
1955 return unicode_empty;
1956 }
1957 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001958 if (size == 1) {
1959 Py_UCS4 ch = u[0];
1960 if (ch < 256)
1961 return get_latin1_char((unsigned char)ch);
1962
1963 res = PyUnicode_New(1, ch);
1964 if (res == NULL)
1965 return NULL;
1966 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1967 assert(_PyUnicode_CheckConsistency(res, 1));
1968 return res;
1969 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001970
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001971 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001972 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 if (!res)
1974 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001975 if (max_char < 256)
1976 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1977 PyUnicode_1BYTE_DATA(res));
1978 else if (max_char < 0x10000)
1979 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1980 PyUnicode_2BYTE_DATA(res));
1981 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001982 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001983 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 return res;
1985}
1986
1987PyObject*
1988PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1989{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001990 if (size < 0) {
1991 PyErr_SetString(PyExc_ValueError, "size must be positive");
1992 return NULL;
1993 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001994 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001996 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001998 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002000 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002001 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002002 PyErr_SetString(PyExc_SystemError, "invalid kind");
2003 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005}
2006
Victor Stinnerece58de2012-04-23 23:36:38 +02002007Py_UCS4
2008_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2009{
2010 enum PyUnicode_Kind kind;
2011 void *startptr, *endptr;
2012
2013 assert(PyUnicode_IS_READY(unicode));
2014 assert(0 <= start);
2015 assert(end <= PyUnicode_GET_LENGTH(unicode));
2016 assert(start <= end);
2017
2018 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2019 return PyUnicode_MAX_CHAR_VALUE(unicode);
2020
2021 if (start == end)
2022 return 127;
2023
Victor Stinner94d558b2012-04-27 22:26:58 +02002024 if (PyUnicode_IS_ASCII(unicode))
2025 return 127;
2026
Victor Stinnerece58de2012-04-23 23:36:38 +02002027 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002028 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002029 endptr = (char *)startptr + end * kind;
2030 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002031 switch(kind) {
2032 case PyUnicode_1BYTE_KIND:
2033 return ucs1lib_find_max_char(startptr, endptr);
2034 case PyUnicode_2BYTE_KIND:
2035 return ucs2lib_find_max_char(startptr, endptr);
2036 case PyUnicode_4BYTE_KIND:
2037 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002038 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002039 assert(0);
2040 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002041 }
2042}
2043
Victor Stinner25a4b292011-10-06 12:31:55 +02002044/* Ensure that a string uses the most efficient storage, if it is not the
2045 case: create a new string with of the right kind. Write NULL into *p_unicode
2046 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002047static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002048unicode_adjust_maxchar(PyObject **p_unicode)
2049{
2050 PyObject *unicode, *copy;
2051 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002052 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002053 unsigned int kind;
2054
2055 assert(p_unicode != NULL);
2056 unicode = *p_unicode;
2057 assert(PyUnicode_IS_READY(unicode));
2058 if (PyUnicode_IS_ASCII(unicode))
2059 return;
2060
2061 len = PyUnicode_GET_LENGTH(unicode);
2062 kind = PyUnicode_KIND(unicode);
2063 if (kind == PyUnicode_1BYTE_KIND) {
2064 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002065 max_char = ucs1lib_find_max_char(u, u + len);
2066 if (max_char >= 128)
2067 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002068 }
2069 else if (kind == PyUnicode_2BYTE_KIND) {
2070 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002071 max_char = ucs2lib_find_max_char(u, u + len);
2072 if (max_char >= 256)
2073 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002074 }
2075 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002076 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002077 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002078 max_char = ucs4lib_find_max_char(u, u + len);
2079 if (max_char >= 0x10000)
2080 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002081 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002082 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002083 if (copy != NULL)
2084 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002085 Py_DECREF(unicode);
2086 *p_unicode = copy;
2087}
2088
Victor Stinner034f6cf2011-09-30 02:26:44 +02002089PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002090_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002091{
Victor Stinner87af4f22011-11-21 23:03:47 +01002092 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002093 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002094
Victor Stinner034f6cf2011-09-30 02:26:44 +02002095 if (!PyUnicode_Check(unicode)) {
2096 PyErr_BadInternalCall();
2097 return NULL;
2098 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002099 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002100 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002101
Victor Stinner87af4f22011-11-21 23:03:47 +01002102 length = PyUnicode_GET_LENGTH(unicode);
2103 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002104 if (!copy)
2105 return NULL;
2106 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2107
Victor Stinner87af4f22011-11-21 23:03:47 +01002108 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2109 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002110 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002111 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002112}
2113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002114
Victor Stinnerbc603d12011-10-02 01:00:40 +02002115/* Widen Unicode objects to larger buffers. Don't write terminating null
2116 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002117
2118void*
2119_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2120{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002121 Py_ssize_t len;
2122 void *result;
2123 unsigned int skind;
2124
Benjamin Petersonbac79492012-01-14 13:34:47 -05002125 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002126 return NULL;
2127
2128 len = PyUnicode_GET_LENGTH(s);
2129 skind = PyUnicode_KIND(s);
2130 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002131 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002132 return NULL;
2133 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002134 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002135 case PyUnicode_2BYTE_KIND:
2136 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2137 if (!result)
2138 return PyErr_NoMemory();
2139 assert(skind == PyUnicode_1BYTE_KIND);
2140 _PyUnicode_CONVERT_BYTES(
2141 Py_UCS1, Py_UCS2,
2142 PyUnicode_1BYTE_DATA(s),
2143 PyUnicode_1BYTE_DATA(s) + len,
2144 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002146 case PyUnicode_4BYTE_KIND:
2147 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2148 if (!result)
2149 return PyErr_NoMemory();
2150 if (skind == PyUnicode_2BYTE_KIND) {
2151 _PyUnicode_CONVERT_BYTES(
2152 Py_UCS2, Py_UCS4,
2153 PyUnicode_2BYTE_DATA(s),
2154 PyUnicode_2BYTE_DATA(s) + len,
2155 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002156 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002157 else {
2158 assert(skind == PyUnicode_1BYTE_KIND);
2159 _PyUnicode_CONVERT_BYTES(
2160 Py_UCS1, Py_UCS4,
2161 PyUnicode_1BYTE_DATA(s),
2162 PyUnicode_1BYTE_DATA(s) + len,
2163 result);
2164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002166 default:
2167 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 }
Victor Stinner01698042011-10-04 00:04:26 +02002169 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 return NULL;
2171}
2172
2173static Py_UCS4*
2174as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2175 int copy_null)
2176{
2177 int kind;
2178 void *data;
2179 Py_ssize_t len, targetlen;
2180 if (PyUnicode_READY(string) == -1)
2181 return NULL;
2182 kind = PyUnicode_KIND(string);
2183 data = PyUnicode_DATA(string);
2184 len = PyUnicode_GET_LENGTH(string);
2185 targetlen = len;
2186 if (copy_null)
2187 targetlen++;
2188 if (!target) {
2189 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2190 PyErr_NoMemory();
2191 return NULL;
2192 }
2193 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2194 if (!target) {
2195 PyErr_NoMemory();
2196 return NULL;
2197 }
2198 }
2199 else {
2200 if (targetsize < targetlen) {
2201 PyErr_Format(PyExc_SystemError,
2202 "string is longer than the buffer");
2203 if (copy_null && 0 < targetsize)
2204 target[0] = 0;
2205 return NULL;
2206 }
2207 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002208 if (kind == PyUnicode_1BYTE_KIND) {
2209 Py_UCS1 *start = (Py_UCS1 *) data;
2210 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002212 else if (kind == PyUnicode_2BYTE_KIND) {
2213 Py_UCS2 *start = (Py_UCS2 *) data;
2214 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2215 }
2216 else {
2217 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 if (copy_null)
2221 target[len] = 0;
2222 return target;
2223}
2224
2225Py_UCS4*
2226PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2227 int copy_null)
2228{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002229 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 PyErr_BadInternalCall();
2231 return NULL;
2232 }
2233 return as_ucs4(string, target, targetsize, copy_null);
2234}
2235
2236Py_UCS4*
2237PyUnicode_AsUCS4Copy(PyObject *string)
2238{
2239 return as_ucs4(string, NULL, 0, 1);
2240}
2241
2242#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002243
Alexander Belopolsky40018472011-02-26 01:02:56 +00002244PyObject *
2245PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002248 if (size == 0) {
2249 Py_INCREF(unicode_empty);
2250 return unicode_empty;
2251 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002252 PyErr_BadInternalCall();
2253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254 }
2255
Martin v. Löwis790465f2008-04-05 20:41:37 +00002256 if (size == -1) {
2257 size = wcslen(w);
2258 }
2259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261}
2262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002264
Walter Dörwald346737f2007-05-31 10:44:43 +00002265static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002266makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002267 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002268{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002270 if (longflag)
2271 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002272 else if (longlongflag) {
2273 /* longlongflag should only ever be nonzero on machines with
2274 HAVE_LONG_LONG defined */
2275#ifdef HAVE_LONG_LONG
2276 char *f = PY_FORMAT_LONG_LONG;
2277 while (*f)
2278 *fmt++ = *f++;
2279#else
2280 /* we shouldn't ever get here */
2281 assert(0);
2282 *fmt++ = 'l';
2283#endif
2284 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002285 else if (size_tflag) {
2286 char *f = PY_FORMAT_SIZE_T;
2287 while (*f)
2288 *fmt++ = *f++;
2289 }
2290 *fmt++ = c;
2291 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002292}
2293
Victor Stinner15a11362012-10-06 23:48:20 +02002294/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002295 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2296 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2297#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002298
2299static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002300unicode_fromformat_arg(_PyUnicodeWriter *writer,
2301 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002302{
Victor Stinnere215d962012-10-06 23:03:36 +02002303 const char *p;
2304 Py_ssize_t len;
2305 int zeropad;
2306 int width;
2307 int precision;
2308 int longflag;
2309 int longlongflag;
2310 int size_tflag;
2311 int fill;
2312
2313 p = f;
2314 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002315 zeropad = 0;
2316 if (*f == '0') {
2317 zeropad = 1;
2318 f++;
2319 }
Victor Stinner96865452011-03-01 23:44:09 +00002320
2321 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002322 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002323 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002324 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2325 PyErr_SetString(PyExc_ValueError,
2326 "width too big");
2327 return NULL;
2328 }
Victor Stinnere215d962012-10-06 23:03:36 +02002329 width = (width*10) + (*f - '0');
2330 f++;
2331 }
Victor Stinner96865452011-03-01 23:44:09 +00002332 precision = 0;
2333 if (*f == '.') {
2334 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002335 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002336 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2337 PyErr_SetString(PyExc_ValueError,
2338 "precision too big");
2339 return NULL;
2340 }
Victor Stinnere215d962012-10-06 23:03:36 +02002341 precision = (precision*10) + (*f - '0');
2342 f++;
2343 }
Victor Stinner96865452011-03-01 23:44:09 +00002344 if (*f == '%') {
2345 /* "%.3%s" => f points to "3" */
2346 f--;
2347 }
2348 }
2349 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002350 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002351 f--;
2352 }
Victor Stinner96865452011-03-01 23:44:09 +00002353
2354 /* Handle %ld, %lu, %lld and %llu. */
2355 longflag = 0;
2356 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002357 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002358 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002359 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002360 longflag = 1;
2361 ++f;
2362 }
2363#ifdef HAVE_LONG_LONG
2364 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002365 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002366 longlongflag = 1;
2367 f += 2;
2368 }
2369#endif
2370 }
2371 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002372 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002373 size_tflag = 1;
2374 ++f;
2375 }
Victor Stinnere215d962012-10-06 23:03:36 +02002376
2377 if (f[1] == '\0')
2378 writer->overallocate = 0;
2379
2380 switch (*f) {
2381 case 'c':
2382 {
2383 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002384 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2385 PyErr_SetString(PyExc_ValueError,
2386 "character argument not in range(0x110000)");
2387 return NULL;
2388 }
Victor Stinnere215d962012-10-06 23:03:36 +02002389 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2390 return NULL;
2391 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2392 writer->pos++;
2393 break;
2394 }
2395
2396 case 'i':
2397 case 'd':
2398 case 'u':
2399 case 'x':
2400 {
2401 /* used by sprintf */
2402 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002403 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002404
2405 if (*f == 'u') {
2406 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2407
2408 if (longflag)
2409 len = sprintf(buffer, fmt,
2410 va_arg(*vargs, unsigned long));
2411#ifdef HAVE_LONG_LONG
2412 else if (longlongflag)
2413 len = sprintf(buffer, fmt,
2414 va_arg(*vargs, unsigned PY_LONG_LONG));
2415#endif
2416 else if (size_tflag)
2417 len = sprintf(buffer, fmt,
2418 va_arg(*vargs, size_t));
2419 else
2420 len = sprintf(buffer, fmt,
2421 va_arg(*vargs, unsigned int));
2422 }
2423 else if (*f == 'x') {
2424 makefmt(fmt, 0, 0, 0, 'x');
2425 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2426 }
2427 else {
2428 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2429
2430 if (longflag)
2431 len = sprintf(buffer, fmt,
2432 va_arg(*vargs, long));
2433#ifdef HAVE_LONG_LONG
2434 else if (longlongflag)
2435 len = sprintf(buffer, fmt,
2436 va_arg(*vargs, PY_LONG_LONG));
2437#endif
2438 else if (size_tflag)
2439 len = sprintf(buffer, fmt,
2440 va_arg(*vargs, Py_ssize_t));
2441 else
2442 len = sprintf(buffer, fmt,
2443 va_arg(*vargs, int));
2444 }
2445 assert(len >= 0);
2446
Victor Stinnere215d962012-10-06 23:03:36 +02002447 if (precision < len)
2448 precision = len;
2449 if (width > precision) {
2450 Py_UCS4 fillchar;
2451 fill = width - precision;
2452 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002453 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2454 return NULL;
2455 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2456 return NULL;
2457 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002458 }
Victor Stinner15a11362012-10-06 23:48:20 +02002459 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002460 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002461 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2462 return NULL;
2463 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2464 return NULL;
2465 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002466 }
Victor Stinner15a11362012-10-06 23:48:20 +02002467 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002468 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002469 break;
2470 }
2471
2472 case 'p':
2473 {
2474 char number[MAX_LONG_LONG_CHARS];
2475
2476 len = sprintf(number, "%p", va_arg(*vargs, void*));
2477 assert(len >= 0);
2478
2479 /* %p is ill-defined: ensure leading 0x. */
2480 if (number[1] == 'X')
2481 number[1] = 'x';
2482 else if (number[1] != 'x') {
2483 memmove(number + 2, number,
2484 strlen(number) + 1);
2485 number[0] = '0';
2486 number[1] = 'x';
2487 len += 2;
2488 }
2489
2490 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2491 return NULL;
2492 break;
2493 }
2494
2495 case 's':
2496 {
2497 /* UTF-8 */
2498 const char *s = va_arg(*vargs, const char*);
2499 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2500 if (!str)
2501 return NULL;
2502 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2503 Py_DECREF(str);
2504 return NULL;
2505 }
2506 Py_DECREF(str);
2507 break;
2508 }
2509
2510 case 'U':
2511 {
2512 PyObject *obj = va_arg(*vargs, PyObject *);
2513 assert(obj && _PyUnicode_CHECK(obj));
2514
2515 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2516 return NULL;
2517 break;
2518 }
2519
2520 case 'V':
2521 {
2522 PyObject *obj = va_arg(*vargs, PyObject *);
2523 const char *str = va_arg(*vargs, const char *);
2524 PyObject *str_obj;
2525 assert(obj || str);
2526 if (obj) {
2527 assert(_PyUnicode_CHECK(obj));
2528 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2529 return NULL;
2530 }
2531 else {
2532 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2533 if (!str_obj)
2534 return NULL;
2535 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2536 Py_DECREF(str_obj);
2537 return NULL;
2538 }
2539 Py_DECREF(str_obj);
2540 }
2541 break;
2542 }
2543
2544 case 'S':
2545 {
2546 PyObject *obj = va_arg(*vargs, PyObject *);
2547 PyObject *str;
2548 assert(obj);
2549 str = PyObject_Str(obj);
2550 if (!str)
2551 return NULL;
2552 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2553 Py_DECREF(str);
2554 return NULL;
2555 }
2556 Py_DECREF(str);
2557 break;
2558 }
2559
2560 case 'R':
2561 {
2562 PyObject *obj = va_arg(*vargs, PyObject *);
2563 PyObject *repr;
2564 assert(obj);
2565 repr = PyObject_Repr(obj);
2566 if (!repr)
2567 return NULL;
2568 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2569 Py_DECREF(repr);
2570 return NULL;
2571 }
2572 Py_DECREF(repr);
2573 break;
2574 }
2575
2576 case 'A':
2577 {
2578 PyObject *obj = va_arg(*vargs, PyObject *);
2579 PyObject *ascii;
2580 assert(obj);
2581 ascii = PyObject_ASCII(obj);
2582 if (!ascii)
2583 return NULL;
2584 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2585 Py_DECREF(ascii);
2586 return NULL;
2587 }
2588 Py_DECREF(ascii);
2589 break;
2590 }
2591
2592 case '%':
2593 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2594 return NULL;
2595 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2596 writer->pos++;
2597 break;
2598
2599 default:
2600 /* if we stumble upon an unknown formatting code, copy the rest
2601 of the format string to the output string. (we cannot just
2602 skip the code, since there's no way to know what's in the
2603 argument list) */
2604 len = strlen(p);
2605 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2606 return NULL;
2607 f = p+len;
2608 return f;
2609 }
2610
2611 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002612 return f;
2613}
2614
Walter Dörwaldd2034312007-05-18 16:29:38 +00002615PyObject *
2616PyUnicode_FromFormatV(const char *format, va_list vargs)
2617{
Victor Stinnere215d962012-10-06 23:03:36 +02002618 va_list vargs2;
2619 const char *f;
2620 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002621
Victor Stinnere215d962012-10-06 23:03:36 +02002622 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2623
2624 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2625 Copy it to be able to pass a reference to a subfunction. */
2626 Py_VA_COPY(vargs2, vargs);
2627
2628 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002629 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002630 f = unicode_fromformat_arg(&writer, f, &vargs2);
2631 if (f == NULL)
2632 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002635 const char *p;
2636 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002637
Victor Stinnere215d962012-10-06 23:03:36 +02002638 p = f;
2639 do
2640 {
2641 if ((unsigned char)*p > 127) {
2642 PyErr_Format(PyExc_ValueError,
2643 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2644 "string, got a non-ASCII byte: 0x%02x",
2645 (unsigned char)*p);
2646 return NULL;
2647 }
2648 p++;
2649 }
2650 while (*p != '\0' && *p != '%');
2651 len = p - f;
2652
2653 if (*p == '\0')
2654 writer.overallocate = 0;
2655 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2656 goto fail;
2657 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2658 writer.pos += len;
2659
2660 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002661 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002662 }
Victor Stinnere215d962012-10-06 23:03:36 +02002663 return _PyUnicodeWriter_Finish(&writer);
2664
2665 fail:
2666 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002667 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002668}
2669
Walter Dörwaldd2034312007-05-18 16:29:38 +00002670PyObject *
2671PyUnicode_FromFormat(const char *format, ...)
2672{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 PyObject* ret;
2674 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002675
2676#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002677 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002678#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002679 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002680#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002681 ret = PyUnicode_FromFormatV(format, vargs);
2682 va_end(vargs);
2683 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002684}
2685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686#ifdef HAVE_WCHAR_H
2687
Victor Stinner5593d8a2010-10-02 11:11:27 +00002688/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2689 convert a Unicode object to a wide character string.
2690
Victor Stinnerd88d9832011-09-06 02:00:05 +02002691 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002692 character) required to convert the unicode object. Ignore size argument.
2693
Victor Stinnerd88d9832011-09-06 02:00:05 +02002694 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002695 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002696 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002697static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002698unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002699 wchar_t *w,
2700 Py_ssize_t size)
2701{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002702 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002703 const wchar_t *wstr;
2704
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002705 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002706 if (wstr == NULL)
2707 return -1;
2708
Victor Stinner5593d8a2010-10-02 11:11:27 +00002709 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002710 if (size > res)
2711 size = res + 1;
2712 else
2713 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002715 return res;
2716 }
2717 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002719}
2720
2721Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002722PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002723 wchar_t *w,
2724 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725{
2726 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002727 PyErr_BadInternalCall();
2728 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002730 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731}
2732
Victor Stinner137c34c2010-09-29 10:25:54 +00002733wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002734PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002735 Py_ssize_t *size)
2736{
2737 wchar_t* buffer;
2738 Py_ssize_t buflen;
2739
2740 if (unicode == NULL) {
2741 PyErr_BadInternalCall();
2742 return NULL;
2743 }
2744
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002745 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002746 if (buflen == -1)
2747 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002748 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002749 PyErr_NoMemory();
2750 return NULL;
2751 }
2752
Victor Stinner137c34c2010-09-29 10:25:54 +00002753 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2754 if (buffer == NULL) {
2755 PyErr_NoMemory();
2756 return NULL;
2757 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002758 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002759 if (buflen == -1) {
2760 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002762 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002763 if (size != NULL)
2764 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002765 return buffer;
2766}
2767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002768#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769
Alexander Belopolsky40018472011-02-26 01:02:56 +00002770PyObject *
2771PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002774 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002775 PyErr_SetString(PyExc_ValueError,
2776 "chr() arg not in range(0x110000)");
2777 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002778 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 if (ordinal < 256)
2781 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 v = PyUnicode_New(1, ordinal);
2784 if (v == NULL)
2785 return NULL;
2786 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002787 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002789}
2790
Alexander Belopolsky40018472011-02-26 01:02:56 +00002791PyObject *
2792PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002794 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002795 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002796 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002797 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002798 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002799 Py_INCREF(obj);
2800 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002801 }
2802 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002803 /* For a Unicode subtype that's not a Unicode object,
2804 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002805 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002806 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002807 PyErr_Format(PyExc_TypeError,
2808 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002809 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002810 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002811}
2812
Alexander Belopolsky40018472011-02-26 01:02:56 +00002813PyObject *
2814PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002815 const char *encoding,
2816 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002817{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002818 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002819 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002820
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002822 PyErr_BadInternalCall();
2823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002825
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002826 /* Decoding bytes objects is the most common case and should be fast */
2827 if (PyBytes_Check(obj)) {
2828 if (PyBytes_GET_SIZE(obj) == 0) {
2829 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002830 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002831 }
2832 else {
2833 v = PyUnicode_Decode(
2834 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2835 encoding, errors);
2836 }
2837 return v;
2838 }
2839
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002840 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002841 PyErr_SetString(PyExc_TypeError,
2842 "decoding str is not supported");
2843 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002844 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002845
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002846 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2847 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2848 PyErr_Format(PyExc_TypeError,
2849 "coercing to str: need bytes, bytearray "
2850 "or buffer-like object, %.80s found",
2851 Py_TYPE(obj)->tp_name);
2852 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002853 }
Tim Petersced69f82003-09-16 20:30:58 +00002854
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002855 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002856 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002857 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858 }
Tim Petersced69f82003-09-16 20:30:58 +00002859 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002860 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002861
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002862 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002863 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864}
2865
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8, storing the null-terminated result in `lower`, a
   buffer of `lower_len` bytes.  A NULL encoding is normalized to "utf-8".

   Return 1 on success.  Return 0 on error: the normalized name, including
   its terminating null byte, does not fit in lower_len bytes.  On error
   `lower` may hold a partial result that is not null-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* No room even for the terminating null byte; also keeps
       lower_len - 1 below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating null byte */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* last slot is reserved for the terminating null byte */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2902
Alexander Belopolsky40018472011-02-26 01:02:56 +00002903PyObject *
2904PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002905 Py_ssize_t size,
2906 const char *encoding,
2907 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002908{
2909 PyObject *buffer = NULL, *unicode;
2910 Py_buffer info;
2911 char lower[11]; /* Enough for any encoding shortcut */
2912
Fred Drakee4315f52000-05-09 19:53:39 +00002913 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002914 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002915 if ((strcmp(lower, "utf-8") == 0) ||
2916 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002917 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002918 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002919 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002920 (strcmp(lower, "iso-8859-1") == 0))
2921 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002922#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002923 else if (strcmp(lower, "mbcs") == 0)
2924 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002925#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002926 else if (strcmp(lower, "ascii") == 0)
2927 return PyUnicode_DecodeASCII(s, size, errors);
2928 else if (strcmp(lower, "utf-16") == 0)
2929 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2930 else if (strcmp(lower, "utf-32") == 0)
2931 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933
2934 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002935 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002936 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002937 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002938 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 if (buffer == NULL)
2940 goto onError;
2941 unicode = PyCodec_Decode(buffer, encoding, errors);
2942 if (unicode == NULL)
2943 goto onError;
2944 if (!PyUnicode_Check(unicode)) {
2945 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002946 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002947 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002948 Py_DECREF(unicode);
2949 goto onError;
2950 }
2951 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002952 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002953
Benjamin Peterson29060642009-01-31 22:14:21 +00002954 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 Py_XDECREF(buffer);
2956 return NULL;
2957}
2958
Alexander Belopolsky40018472011-02-26 01:02:56 +00002959PyObject *
2960PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002961 const char *encoding,
2962 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002963{
2964 PyObject *v;
2965
2966 if (!PyUnicode_Check(unicode)) {
2967 PyErr_BadArgument();
2968 goto onError;
2969 }
2970
2971 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002972 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002973
2974 /* Decode via the codec registry */
2975 v = PyCodec_Decode(unicode, encoding, errors);
2976 if (v == NULL)
2977 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002978 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002979
Benjamin Peterson29060642009-01-31 22:14:21 +00002980 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002981 return NULL;
2982}
2983
Alexander Belopolsky40018472011-02-26 01:02:56 +00002984PyObject *
2985PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002986 const char *encoding,
2987 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002988{
2989 PyObject *v;
2990
2991 if (!PyUnicode_Check(unicode)) {
2992 PyErr_BadArgument();
2993 goto onError;
2994 }
2995
2996 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002997 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002998
2999 /* Decode via the codec registry */
3000 v = PyCodec_Decode(unicode, encoding, errors);
3001 if (v == NULL)
3002 goto onError;
3003 if (!PyUnicode_Check(v)) {
3004 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003005 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003006 Py_TYPE(v)->tp_name);
3007 Py_DECREF(v);
3008 goto onError;
3009 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003010 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003011
Benjamin Peterson29060642009-01-31 22:14:21 +00003012 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003013 return NULL;
3014}
3015
Alexander Belopolsky40018472011-02-26 01:02:56 +00003016PyObject *
3017PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003018 Py_ssize_t size,
3019 const char *encoding,
3020 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021{
3022 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003023
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 unicode = PyUnicode_FromUnicode(s, size);
3025 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3028 Py_DECREF(unicode);
3029 return v;
3030}
3031
Alexander Belopolsky40018472011-02-26 01:02:56 +00003032PyObject *
3033PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003034 const char *encoding,
3035 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003036{
3037 PyObject *v;
3038
3039 if (!PyUnicode_Check(unicode)) {
3040 PyErr_BadArgument();
3041 goto onError;
3042 }
3043
3044 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003046
3047 /* Encode via the codec registry */
3048 v = PyCodec_Encode(unicode, encoding, errors);
3049 if (v == NULL)
3050 goto onError;
3051 return v;
3052
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003054 return NULL;
3055}
3056
/* Locate the first wide character of wstr that wcstombs() cannot convert
   in the current locale.  The string is converted one character at a time
   (a high/low surrogate pair is converted as one unit when wchar_t is 16
   bits wide).

   Return the index into wstr of the first unit for which wcstombs() fails,
   or 0 if no failure is found; an index of 0 is therefore ambiguous. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    size_t len;
    /* One character (or one surrogate pair) plus the terminator.  The
       zero-initialization provides the terminator, which is never
       overwritten below. */
#if SIZEOF_WCHAR_T == 2
    wchar_t buf[3] = {0};
#else
    wchar_t buf[2] = {0};
#endif
    char outbuf[MB_LEN_MAX];
    const wchar_t *start, *previous;

    start = wstr;
    while (*wstr != L'\0')
    {
        previous = wstr;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
        {
            buf[0] = wstr[0];
            buf[1] = wstr[1];
            wstr += 2;
        }
        else {
            buf[0] = *wstr;
            /* Clear a low surrogate left over from a previous pair. */
            buf[1] = 0;
            wstr++;
        }
#else
        buf[0] = *wstr;
        wstr++;
#endif
        len = wcstombs(outbuf, buf, sizeof(outbuf));
        if (len == (size_t)-1)
            return previous - start;
    }

    /* failed to find the unencodable character */
    return 0;
}
3103
Victor Stinner1b579672011-12-17 05:47:23 +01003104static int
3105locale_error_handler(const char *errors, int *surrogateescape)
3106{
3107 if (errors == NULL) {
3108 *surrogateescape = 0;
3109 return 0;
3110 }
3111
3112 if (strcmp(errors, "strict") == 0) {
3113 *surrogateescape = 0;
3114 return 0;
3115 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003116 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003117 *surrogateescape = 1;
3118 return 0;
3119 }
3120 PyErr_Format(PyExc_ValueError,
3121 "only 'strict' and 'surrogateescape' error handlers "
3122 "are supported, not '%s'",
3123 errors);
3124 return -1;
3125}
3126
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003127PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003128PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003129{
3130 Py_ssize_t wlen, wlen2;
3131 wchar_t *wstr;
3132 PyObject *bytes = NULL;
3133 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003134 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003135 PyObject *exc;
3136 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003137 int surrogateescape;
3138
3139 if (locale_error_handler(errors, &surrogateescape) < 0)
3140 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003141
3142 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3143 if (wstr == NULL)
3144 return NULL;
3145
3146 wlen2 = wcslen(wstr);
3147 if (wlen2 != wlen) {
3148 PyMem_Free(wstr);
3149 PyErr_SetString(PyExc_TypeError, "embedded null character");
3150 return NULL;
3151 }
3152
3153 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003154 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003155 char *str;
3156
3157 str = _Py_wchar2char(wstr, &error_pos);
3158 if (str == NULL) {
3159 if (error_pos == (size_t)-1) {
3160 PyErr_NoMemory();
3161 PyMem_Free(wstr);
3162 return NULL;
3163 }
3164 else {
3165 goto encode_error;
3166 }
3167 }
3168 PyMem_Free(wstr);
3169
3170 bytes = PyBytes_FromString(str);
3171 PyMem_Free(str);
3172 }
3173 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003174 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003175 size_t len, len2;
3176
3177 len = wcstombs(NULL, wstr, 0);
3178 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003179 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003180 goto encode_error;
3181 }
3182
3183 bytes = PyBytes_FromStringAndSize(NULL, len);
3184 if (bytes == NULL) {
3185 PyMem_Free(wstr);
3186 return NULL;
3187 }
3188
3189 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3190 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003191 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003192 goto encode_error;
3193 }
3194 PyMem_Free(wstr);
3195 }
3196 return bytes;
3197
3198encode_error:
3199 errmsg = strerror(errno);
3200 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003201
3202 if (error_pos == (size_t)-1)
3203 error_pos = wcstombs_errorpos(wstr);
3204
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003205 PyMem_Free(wstr);
3206 Py_XDECREF(bytes);
3207
Victor Stinner2f197072011-12-17 07:08:30 +01003208 if (errmsg != NULL) {
3209 size_t errlen;
3210 wstr = _Py_char2wchar(errmsg, &errlen);
3211 if (wstr != NULL) {
3212 reason = PyUnicode_FromWideChar(wstr, errlen);
3213 PyMem_Free(wstr);
3214 } else
3215 errmsg = NULL;
3216 }
3217 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003218 reason = PyUnicode_FromString(
3219 "wcstombs() encountered an unencodable "
3220 "wide character");
3221 if (reason == NULL)
3222 return NULL;
3223
3224 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3225 "locale", unicode,
3226 (Py_ssize_t)error_pos,
3227 (Py_ssize_t)(error_pos+1),
3228 reason);
3229 Py_DECREF(reason);
3230 if (exc != NULL) {
3231 PyCodec_StrictErrors(exc);
3232 Py_XDECREF(exc);
3233 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003234 return NULL;
3235}
3236
Victor Stinnerad158722010-10-27 00:25:46 +00003237PyObject *
3238PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003239{
Victor Stinner99b95382011-07-04 14:23:54 +02003240#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003241 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003242#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003243 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003244#else
Victor Stinner793b5312011-04-27 00:24:21 +02003245 PyInterpreterState *interp = PyThreadState_GET()->interp;
3246 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3247 cannot use it to encode and decode filenames before it is loaded. Load
3248 the Python codec requires to encode at least its own filename. Use the C
3249 version of the locale codec until the codec registry is initialized and
3250 the Python codec is loaded.
3251
3252 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3253 cannot only rely on it: check also interp->fscodec_initialized for
3254 subinterpreters. */
3255 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003256 return PyUnicode_AsEncodedString(unicode,
3257 Py_FileSystemDefaultEncoding,
3258 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003259 }
3260 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003261 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003262 }
Victor Stinnerad158722010-10-27 00:25:46 +00003263#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003264}
3265
Alexander Belopolsky40018472011-02-26 01:02:56 +00003266PyObject *
3267PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003268 const char *encoding,
3269 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270{
3271 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003272 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003273
Guido van Rossumd57fd912000-03-10 22:53:23 +00003274 if (!PyUnicode_Check(unicode)) {
3275 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277 }
Fred Drakee4315f52000-05-09 19:53:39 +00003278
Fred Drakee4315f52000-05-09 19:53:39 +00003279 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003280 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003281 if ((strcmp(lower, "utf-8") == 0) ||
3282 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003283 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003284 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003285 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003286 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003287 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003288 }
Victor Stinner37296e82010-06-10 13:36:23 +00003289 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003290 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003291 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003292 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003293#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003294 else if (strcmp(lower, "mbcs") == 0)
3295 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003296#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003297 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003298 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300
3301 /* Encode via the codec registry */
3302 v = PyCodec_Encode(unicode, encoding, errors);
3303 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003304 return NULL;
3305
3306 /* The normal path */
3307 if (PyBytes_Check(v))
3308 return v;
3309
3310 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003311 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003312 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003313 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003314
3315 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3316 "encoder %s returned bytearray instead of bytes",
3317 encoding);
3318 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003319 Py_DECREF(v);
3320 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003321 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003322
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003323 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3324 Py_DECREF(v);
3325 return b;
3326 }
3327
3328 PyErr_Format(PyExc_TypeError,
3329 "encoder did not return a bytes object (type=%.400s)",
3330 Py_TYPE(v)->tp_name);
3331 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003332 return NULL;
3333}
3334
Alexander Belopolsky40018472011-02-26 01:02:56 +00003335PyObject *
3336PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003337 const char *encoding,
3338 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003339{
3340 PyObject *v;
3341
3342 if (!PyUnicode_Check(unicode)) {
3343 PyErr_BadArgument();
3344 goto onError;
3345 }
3346
3347 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003348 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003349
3350 /* Encode via the codec registry */
3351 v = PyCodec_Encode(unicode, encoding, errors);
3352 if (v == NULL)
3353 goto onError;
3354 if (!PyUnicode_Check(v)) {
3355 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003356 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003357 Py_TYPE(v)->tp_name);
3358 Py_DECREF(v);
3359 goto onError;
3360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003362
Benjamin Peterson29060642009-01-31 22:14:21 +00003363 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 return NULL;
3365}
3366
/* Locate the first byte sequence in str (len bytes) that cannot be decoded
   in the current locale.

   Return the byte offset of the first invalid or incomplete multibyte
   sequence, or 0 if none is found; an offset of 0 is therefore ambiguous.
   Always returns 0 when mbrtowc() is not available. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        else {
            str += converted;
            len -= converted;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* The position cannot be determined without mbrtowc(). */
    (void)str;
    (void)len;
    return 0;
#endif
}
3397
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003398PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003399PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003400 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003401{
3402 wchar_t smallbuf[256];
3403 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3404 wchar_t *wstr;
3405 size_t wlen, wlen2;
3406 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003407 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003408 size_t error_pos;
3409 char *errmsg;
3410 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003411
3412 if (locale_error_handler(errors, &surrogateescape) < 0)
3413 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003414
3415 if (str[len] != '\0' || len != strlen(str)) {
3416 PyErr_SetString(PyExc_TypeError, "embedded null character");
3417 return NULL;
3418 }
3419
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003420 if (surrogateescape) {
3421 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003422 wstr = _Py_char2wchar(str, &wlen);
3423 if (wstr == NULL) {
3424 if (wlen == (size_t)-1)
3425 PyErr_NoMemory();
3426 else
3427 PyErr_SetFromErrno(PyExc_OSError);
3428 return NULL;
3429 }
3430
3431 unicode = PyUnicode_FromWideChar(wstr, wlen);
3432 PyMem_Free(wstr);
3433 }
3434 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003435 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003436#ifndef HAVE_BROKEN_MBSTOWCS
3437 wlen = mbstowcs(NULL, str, 0);
3438#else
3439 wlen = len;
3440#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003441 if (wlen == (size_t)-1)
3442 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003443 if (wlen+1 <= smallbuf_len) {
3444 wstr = smallbuf;
3445 }
3446 else {
3447 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3448 return PyErr_NoMemory();
3449
3450 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3451 if (!wstr)
3452 return PyErr_NoMemory();
3453 }
3454
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003455 wlen2 = mbstowcs(wstr, str, wlen+1);
3456 if (wlen2 == (size_t)-1) {
3457 if (wstr != smallbuf)
3458 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003459 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003460 }
3461#ifdef HAVE_BROKEN_MBSTOWCS
3462 assert(wlen2 == wlen);
3463#endif
3464 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3465 if (wstr != smallbuf)
3466 PyMem_Free(wstr);
3467 }
3468 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003469
3470decode_error:
3471 errmsg = strerror(errno);
3472 assert(errmsg != NULL);
3473
3474 error_pos = mbstowcs_errorpos(str, len);
3475 if (errmsg != NULL) {
3476 size_t errlen;
3477 wstr = _Py_char2wchar(errmsg, &errlen);
3478 if (wstr != NULL) {
3479 reason = PyUnicode_FromWideChar(wstr, errlen);
3480 PyMem_Free(wstr);
3481 } else
3482 errmsg = NULL;
3483 }
3484 if (errmsg == NULL)
3485 reason = PyUnicode_FromString(
3486 "mbstowcs() encountered an invalid multibyte sequence");
3487 if (reason == NULL)
3488 return NULL;
3489
3490 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3491 "locale", str, len,
3492 (Py_ssize_t)error_pos,
3493 (Py_ssize_t)(error_pos+1),
3494 reason);
3495 Py_DECREF(reason);
3496 if (exc != NULL) {
3497 PyCodec_StrictErrors(exc);
3498 Py_XDECREF(exc);
3499 }
3500 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003501}
3502
3503PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003504PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003505{
3506 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003507 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003508}
3509
3510
3511PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003512PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003513 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003514 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3515}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003516
Christian Heimes5894ba72007-11-04 11:43:14 +00003517PyObject*
3518PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3519{
Victor Stinner99b95382011-07-04 14:23:54 +02003520#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003521 return PyUnicode_DecodeMBCS(s, size, NULL);
3522#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003523 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003524#else
Victor Stinner793b5312011-04-27 00:24:21 +02003525 PyInterpreterState *interp = PyThreadState_GET()->interp;
3526 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3527 cannot use it to encode and decode filenames before it is loaded. Load
3528 the Python codec requires to encode at least its own filename. Use the C
3529 version of the locale codec until the codec registry is initialized and
3530 the Python codec is loaded.
3531
3532 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3533 cannot only rely on it: check also interp->fscodec_initialized for
3534 subinterpreters. */
3535 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003536 return PyUnicode_Decode(s, size,
3537 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003538 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003539 }
3540 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003541 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003542 }
Victor Stinnerad158722010-10-27 00:25:46 +00003543#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003544}
3545
Martin v. Löwis011e8422009-05-05 04:43:17 +00003546
3547int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003548_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003549{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003550 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003551
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003552 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003553 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003554 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3555 PyUnicode_GET_LENGTH(str), '\0', 1);
3556 if (pos == -1)
3557 return 0;
3558 else
3559 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003560}
3561
Antoine Pitrou13348842012-01-29 18:36:34 +01003562int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003563PyUnicode_FSConverter(PyObject* arg, void* addr)
3564{
3565 PyObject *output = NULL;
3566 Py_ssize_t size;
3567 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003568 if (arg == NULL) {
3569 Py_DECREF(*(PyObject**)addr);
3570 return 1;
3571 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003572 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003573 output = arg;
3574 Py_INCREF(output);
3575 }
3576 else {
3577 arg = PyUnicode_FromObject(arg);
3578 if (!arg)
3579 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003580 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003581 Py_DECREF(arg);
3582 if (!output)
3583 return 0;
3584 if (!PyBytes_Check(output)) {
3585 Py_DECREF(output);
3586 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3587 return 0;
3588 }
3589 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003590 size = PyBytes_GET_SIZE(output);
3591 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003592 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003593 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003594 Py_DECREF(output);
3595 return 0;
3596 }
3597 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003598 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003599}
3600
3601
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003602int
3603PyUnicode_FSDecoder(PyObject* arg, void* addr)
3604{
3605 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003606 if (arg == NULL) {
3607 Py_DECREF(*(PyObject**)addr);
3608 return 1;
3609 }
3610 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003611 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003612 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003613 output = arg;
3614 Py_INCREF(output);
3615 }
3616 else {
3617 arg = PyBytes_FromObject(arg);
3618 if (!arg)
3619 return 0;
3620 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3621 PyBytes_GET_SIZE(arg));
3622 Py_DECREF(arg);
3623 if (!output)
3624 return 0;
3625 if (!PyUnicode_Check(output)) {
3626 Py_DECREF(output);
3627 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3628 return 0;
3629 }
3630 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003631 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003632 Py_DECREF(output);
3633 return 0;
3634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003635 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003636 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003637 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3638 Py_DECREF(output);
3639 return 0;
3640 }
3641 *(PyObject**)addr = output;
3642 return Py_CLEANUP_SUPPORTED;
3643}
3644
3645
Martin v. Löwis5b222132007-06-10 09:51:05 +00003646char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003647PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003648{
Christian Heimesf3863112007-11-22 07:46:41 +00003649 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003650
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003651 if (!PyUnicode_Check(unicode)) {
3652 PyErr_BadArgument();
3653 return NULL;
3654 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003655 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003656 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003657
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003658 if (PyUnicode_UTF8(unicode) == NULL) {
3659 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003660 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3661 if (bytes == NULL)
3662 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003663 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3664 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003665 Py_DECREF(bytes);
3666 return NULL;
3667 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003668 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3669 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3670 PyBytes_AS_STRING(bytes),
3671 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003672 Py_DECREF(bytes);
3673 }
3674
3675 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003676 *psize = PyUnicode_UTF8_LENGTH(unicode);
3677 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003678}
3679
3680char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003681PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003683 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3684}
3685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003686Py_UNICODE *
3687PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3688{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003689 const unsigned char *one_byte;
3690#if SIZEOF_WCHAR_T == 4
3691 const Py_UCS2 *two_bytes;
3692#else
3693 const Py_UCS4 *four_bytes;
3694 const Py_UCS4 *ucs4_end;
3695 Py_ssize_t num_surrogates;
3696#endif
3697 wchar_t *w;
3698 wchar_t *wchar_end;
3699
3700 if (!PyUnicode_Check(unicode)) {
3701 PyErr_BadArgument();
3702 return NULL;
3703 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003704 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003705 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003706 assert(_PyUnicode_KIND(unicode) != 0);
3707 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003708
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003709 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003710#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003711 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3712 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713 num_surrogates = 0;
3714
3715 for (; four_bytes < ucs4_end; ++four_bytes) {
3716 if (*four_bytes > 0xFFFF)
3717 ++num_surrogates;
3718 }
3719
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003720 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3721 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3722 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003723 PyErr_NoMemory();
3724 return NULL;
3725 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003726 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003727
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003728 w = _PyUnicode_WSTR(unicode);
3729 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3730 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003731 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3732 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003733 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003735 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3736 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737 }
3738 else
3739 *w = *four_bytes;
3740
3741 if (w > wchar_end) {
3742 assert(0 && "Miscalculated string end");
3743 }
3744 }
3745 *w = 0;
3746#else
3747 /* sizeof(wchar_t) == 4 */
3748 Py_FatalError("Impossible unicode object state, wstr and str "
3749 "should share memory already.");
3750 return NULL;
3751#endif
3752 }
3753 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003754 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3755 (_PyUnicode_LENGTH(unicode) + 1));
3756 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003757 PyErr_NoMemory();
3758 return NULL;
3759 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003760 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3761 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3762 w = _PyUnicode_WSTR(unicode);
3763 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003765 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3766 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003767 for (; w < wchar_end; ++one_byte, ++w)
3768 *w = *one_byte;
3769 /* null-terminate the wstr */
3770 *w = 0;
3771 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003772 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003775 for (; w < wchar_end; ++two_bytes, ++w)
3776 *w = *two_bytes;
3777 /* null-terminate the wstr */
3778 *w = 0;
3779#else
3780 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003781 PyObject_FREE(_PyUnicode_WSTR(unicode));
3782 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003783 Py_FatalError("Impossible unicode object state, wstr "
3784 "and str should share memory already.");
3785 return NULL;
3786#endif
3787 }
3788 else {
3789 assert(0 && "This should never happen.");
3790 }
3791 }
3792 }
3793 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003794 *size = PyUnicode_WSTR_LENGTH(unicode);
3795 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003796}
3797
Alexander Belopolsky40018472011-02-26 01:02:56 +00003798Py_UNICODE *
3799PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003801 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802}
3803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003804
Alexander Belopolsky40018472011-02-26 01:02:56 +00003805Py_ssize_t
3806PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003807{
3808 if (!PyUnicode_Check(unicode)) {
3809 PyErr_BadArgument();
3810 goto onError;
3811 }
3812 return PyUnicode_GET_SIZE(unicode);
3813
Benjamin Peterson29060642009-01-31 22:14:21 +00003814 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 return -1;
3816}
3817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818Py_ssize_t
3819PyUnicode_GetLength(PyObject *unicode)
3820{
Victor Stinner07621332012-06-16 04:53:46 +02003821 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003822 PyErr_BadArgument();
3823 return -1;
3824 }
Victor Stinner07621332012-06-16 04:53:46 +02003825 if (PyUnicode_READY(unicode) == -1)
3826 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827 return PyUnicode_GET_LENGTH(unicode);
3828}
3829
3830Py_UCS4
3831PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3832{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003833 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3834 PyErr_BadArgument();
3835 return (Py_UCS4)-1;
3836 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003837 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003838 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003839 return (Py_UCS4)-1;
3840 }
3841 return PyUnicode_READ_CHAR(unicode, index);
3842}
3843
3844int
3845PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3846{
3847 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003848 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003849 return -1;
3850 }
Victor Stinner488fa492011-12-12 00:01:39 +01003851 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003852 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003853 PyErr_SetString(PyExc_IndexError, "string index out of range");
3854 return -1;
3855 }
Victor Stinner488fa492011-12-12 00:01:39 +01003856 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003857 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003858 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3859 PyErr_SetString(PyExc_ValueError, "character out of range");
3860 return -1;
3861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3863 index, ch);
3864 return 0;
3865}
3866
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3872
Victor Stinner554f3f02010-06-16 23:33:54 +00003873/* create or adjust a UnicodeDecodeError */
3874static void
3875make_decode_exception(PyObject **exceptionObject,
3876 const char *encoding,
3877 const char *input, Py_ssize_t length,
3878 Py_ssize_t startpos, Py_ssize_t endpos,
3879 const char *reason)
3880{
3881 if (*exceptionObject == NULL) {
3882 *exceptionObject = PyUnicodeDecodeError_Create(
3883 encoding, input, length, startpos, endpos, reason);
3884 }
3885 else {
3886 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3887 goto onError;
3888 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3889 goto onError;
3890 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3891 goto onError;
3892 }
3893 return;
3894
3895onError:
3896 Py_DECREF(*exceptionObject);
3897 *exceptionObject = NULL;
3898}
3899
#ifdef HAVE_MBCS
/* Error handling callback helper (wchar_t output variant):
   build arguments, call the callback and check the result.
   If no exception occurred, copy the replacement to the output
   and adjust various state variables.

   *input / *inend are refreshed from the exception's object attribute,
   because the callback may have replaced it.  *output is resized when the
   replacement plus the remaining input would not fit; *outpos is advanced
   past the replacement; *endinpos and *inptr are set to the position the
   callback asked to resume from.

   Return 0 on success, -1 on error (with an exception set).
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Do not fall through and treat a non-bytes object as bytes. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4006
4007static int
4008unicode_decode_call_errorhandler_writer(
4009 const char *errors, PyObject **errorHandler,
4010 const char *encoding, const char *reason,
4011 const char **input, const char **inend, Py_ssize_t *startinpos,
4012 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4013 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4014{
4015 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4016
4017 PyObject *restuple = NULL;
4018 PyObject *repunicode = NULL;
4019 Py_ssize_t insize;
4020 Py_ssize_t newpos;
4021 PyObject *inputobj = NULL;
4022
4023 if (*errorHandler == NULL) {
4024 *errorHandler = PyCodec_LookupError(errors);
4025 if (*errorHandler == NULL)
4026 goto onError;
4027 }
4028
4029 make_decode_exception(exceptionObject,
4030 encoding,
4031 *input, *inend - *input,
4032 *startinpos, *endinpos,
4033 reason);
4034 if (*exceptionObject == NULL)
4035 goto onError;
4036
4037 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4038 if (restuple == NULL)
4039 goto onError;
4040 if (!PyTuple_Check(restuple)) {
4041 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4042 goto onError;
4043 }
4044 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004045 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004046
4047 /* Copy back the bytes variables, which might have been modified by the
4048 callback */
4049 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4050 if (!inputobj)
4051 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004052 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004053 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004054 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004055 *input = PyBytes_AS_STRING(inputobj);
4056 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004057 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004058 /* we can DECREF safely, as the exception has another reference,
4059 so the object won't go away. */
4060 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004061
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004063 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004064 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004065 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4066 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004067 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004069 writer->overallocate = 1;
4070 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4071 return
4072
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004074 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004075
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004076 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004077 Py_XDECREF(restuple);
4078 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079
Benjamin Peterson29060642009-01-31 22:14:21 +00004080 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004082 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083}
4084
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004085/* --- UTF-7 Codec -------------------------------------------------------- */
4086
Antoine Pitrou244651a2009-05-04 18:56:13 +00004087/* See RFC2152 for details. We encode conservatively and decode liberally. */
4088
/* Three simple macros defining base-64.
 * Each of them may evaluate its argument more than once, so pass only
 * side-effect-free expressions. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * NUL and anything >= 128 are never encoded directly.  The flag arguments
 * are parenthesized so that compound expressions (e.g. "a || b") bind as
 * a single flag. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004165
Alexander Belopolsky40018472011-02-26 01:02:56 +00004166PyObject *
4167PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004168 Py_ssize_t size,
4169 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004170{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004171 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4172}
4173
Antoine Pitrou244651a2009-05-04 18:56:13 +00004174/* The decoder. The only state we preserve is our read position,
4175 * i.e. how many characters we have consumed. So if we end in the
4176 * middle of a shift sequence we have to back off the read position
4177 * and the output to the beginning of the sequence, otherwise we lose
4178 * all the shift state (seen bits, number of bits seen, high
4179 * surrogate). */
4180
Alexander Belopolsky40018472011-02-26 01:02:56 +00004181PyObject *
4182PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004183 Py_ssize_t size,
4184 const char *errors,
4185 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004186{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004188 Py_ssize_t startinpos;
4189 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004190 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004191 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004192 const char *errmsg = "";
4193 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004194 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004195 unsigned int base64bits = 0;
4196 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004197 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 PyObject *errorHandler = NULL;
4199 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004200
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004201 if (size == 0) {
4202 if (consumed)
4203 *consumed = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004204 Py_INCREF(unicode_empty);
4205 return unicode_empty;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004206 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004207
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004208 /* Start off assuming it's all ASCII. Widen later as necessary. */
4209 _PyUnicodeWriter_Init(&writer, 0);
4210 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4211 goto onError;
4212
4213 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004214 e = s + size;
4215
4216 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004217 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004218 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004219 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004220
Antoine Pitrou244651a2009-05-04 18:56:13 +00004221 if (inShift) { /* in a base-64 section */
4222 if (IS_BASE64(ch)) { /* consume a base-64 character */
4223 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4224 base64bits += 6;
4225 s++;
4226 if (base64bits >= 16) {
4227 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004228 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004229 base64bits -= 16;
4230 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4231 if (surrogate) {
4232 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004233 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4234 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004235 if (_PyUnicodeWriter_Prepare(&writer, 1, ch2) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004236 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004237 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch2);
4238 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004239 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004240 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004241 }
4242 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004243 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004244 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004245 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4246 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004247 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004248 }
4249 }
Victor Stinner551ac952011-11-29 22:58:13 +01004250 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004251 /* first surrogate */
4252 surrogate = outCh;
4253 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004254 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004255 if (_PyUnicodeWriter_Prepare(&writer, 1, outCh) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004256 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004257 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, outCh);
4258 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004259 }
4260 }
4261 }
4262 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004263 inShift = 0;
4264 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004265 if (surrogate) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004266 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004267 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004268 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4269 writer.pos++;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004270 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004271 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004272 if (base64bits > 0) { /* left-over bits */
4273 if (base64bits >= 6) {
4274 /* We've seen at least one base-64 character */
4275 errmsg = "partial character in shift sequence";
4276 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004277 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004278 else {
4279 /* Some bits remain; they should be zero */
4280 if (base64buffer != 0) {
4281 errmsg = "non-zero padding bits in shift sequence";
4282 goto utf7Error;
4283 }
4284 }
4285 }
4286 if (ch != '-') {
4287 /* '-' is absorbed; other terminating
4288 characters are preserved */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004289 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004290 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004291 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4292 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004293 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004294 }
4295 }
4296 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004298 s++; /* consume '+' */
4299 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004300 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004301 if (_PyUnicodeWriter_Prepare(&writer, 1, '+') == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004302 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004303 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '+');
4304 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004305 }
4306 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004307 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004308 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004309 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004310 }
4311 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004312 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004313 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004314 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
4315 goto onError;
4316 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4317 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004318 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004319 else {
4320 startinpos = s-starts;
4321 s++;
4322 errmsg = "unexpected special character";
4323 goto utf7Error;
4324 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004325 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004326utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 errors, &errorHandler,
4330 "utf7", errmsg,
4331 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004332 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004333 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334 }
4335
Antoine Pitrou244651a2009-05-04 18:56:13 +00004336 /* end of string */
4337
4338 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4339 /* if we're in an inconsistent state, that's an error */
4340 if (surrogate ||
4341 (base64bits >= 6) ||
4342 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 errors, &errorHandler,
4346 "utf7", "unterminated shift sequence",
4347 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004348 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004349 goto onError;
4350 if (s < e)
4351 goto restart;
4352 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004353 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354
4355 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004356 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004357 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004358 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004359 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 }
4361 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004362 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004364 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366 Py_XDECREF(errorHandler);
4367 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004368 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004369
Benjamin Peterson29060642009-01-31 22:14:21 +00004370 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 Py_XDECREF(errorHandler);
4372 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004373 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374 return NULL;
4375}
4376
4377
Alexander Belopolsky40018472011-02-26 01:02:56 +00004378PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004379_PyUnicode_EncodeUTF7(PyObject *str,
4380 int base64SetO,
4381 int base64WhiteSpace,
4382 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004383{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004384 int kind;
4385 void *data;
4386 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004387 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004388 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004389 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004390 unsigned int base64bits = 0;
4391 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004392 char * out;
4393 char * start;
4394
Benjamin Petersonbac79492012-01-14 13:34:47 -05004395 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004396 return NULL;
4397 kind = PyUnicode_KIND(str);
4398 data = PyUnicode_DATA(str);
4399 len = PyUnicode_GET_LENGTH(str);
4400
4401 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004402 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004403
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004404 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004405 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004406 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004407 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004408 if (v == NULL)
4409 return NULL;
4410
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004411 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004412 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004413 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 if (inShift) {
4416 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4417 /* shifting out */
4418 if (base64bits) { /* output remaining bits */
4419 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4420 base64buffer = 0;
4421 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004422 }
4423 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 /* Characters not in the BASE64 set implicitly unshift the sequence
4425 so no '-' is required, except if the character is itself a '-' */
4426 if (IS_BASE64(ch) || ch == '-') {
4427 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004428 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 *out++ = (char) ch;
4430 }
4431 else {
4432 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004433 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004434 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004435 else { /* not in a shift sequence */
4436 if (ch == '+') {
4437 *out++ = '+';
4438 *out++ = '-';
4439 }
4440 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4441 *out++ = (char) ch;
4442 }
4443 else {
4444 *out++ = '+';
4445 inShift = 1;
4446 goto encode_char;
4447 }
4448 }
4449 continue;
4450encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004452 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004453
Antoine Pitrou244651a2009-05-04 18:56:13 +00004454 /* code first surrogate */
4455 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004456 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004457 while (base64bits >= 6) {
4458 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4459 base64bits -= 6;
4460 }
4461 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004462 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004463 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 base64bits += 16;
4465 base64buffer = (base64buffer << 16) | ch;
4466 while (base64bits >= 6) {
4467 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4468 base64bits -= 6;
4469 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471 if (base64bits)
4472 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4473 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004474 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004475 if (_PyBytes_Resize(&v, out - start) < 0)
4476 return NULL;
4477 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004479PyObject *
4480PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4481 Py_ssize_t size,
4482 int base64SetO,
4483 int base64WhiteSpace,
4484 const char *errors)
4485{
4486 PyObject *result;
4487 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4488 if (tmp == NULL)
4489 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004490 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004491 base64WhiteSpace, errors);
4492 Py_DECREF(tmp);
4493 return result;
4494}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004495
Antoine Pitrou244651a2009-05-04 18:56:13 +00004496#undef IS_BASE64
4497#undef FROM_BASE64
4498#undef TO_BASE64
4499#undef DECODE_DIRECT
4500#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004501
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502/* --- UTF-8 Codec -------------------------------------------------------- */
4503
Alexander Belopolsky40018472011-02-26 01:02:56 +00004504PyObject *
4505PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004506 Py_ssize_t size,
4507 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508{
Walter Dörwald69652032004-09-07 20:24:22 +00004509 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4510}
4511
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004512#include "stringlib/asciilib.h"
4513#include "stringlib/codecs.h"
4514#include "stringlib/undef.h"
4515
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004516#include "stringlib/ucs1lib.h"
4517#include "stringlib/codecs.h"
4518#include "stringlib/undef.h"
4519
4520#include "stringlib/ucs2lib.h"
4521#include "stringlib/codecs.h"
4522#include "stringlib/undef.h"
4523
4524#include "stringlib/ucs4lib.h"
4525#include "stringlib/codecs.h"
4526#include "stringlib/undef.h"
4527
Antoine Pitrouab868312009-01-10 15:40:25 +00004528/* Mask to quickly check whether a C 'long' contains a
4529 non-ASCII, UTF8-encoded char. */
4530#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004531# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004532#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004533# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004534#else
4535# error C 'long' size should be either 4 or 8!
4536#endif
4537
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004538static Py_ssize_t
4539ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004540{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004541 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004542 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004543
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004544#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004545 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4546 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004547 /* Fast path, see in STRINGLIB(utf8_decode) for
4548 an explanation. */
4549 /* Help register allocation */
4550 register const char *_p = p;
4551 register Py_UCS1 * q = dest;
4552 while (_p < aligned_end) {
4553 unsigned long value = *(const unsigned long *) _p;
4554 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004555 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004556 *((unsigned long *)q) = value;
4557 _p += SIZEOF_LONG;
4558 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004559 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004560 p = _p;
4561 while (p < end) {
4562 if ((unsigned char)*p & 0x80)
4563 break;
4564 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004566 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004568#endif
4569 while (p < end) {
4570 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4571 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004572 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004573 /* Help register allocation */
4574 register const char *_p = p;
4575 while (_p < aligned_end) {
4576 unsigned long value = *(unsigned long *) _p;
4577 if (value & ASCII_CHAR_MASK)
4578 break;
4579 _p += SIZEOF_LONG;
4580 }
4581 p = _p;
4582 if (_p == end)
4583 break;
4584 }
4585 if ((unsigned char)*p & 0x80)
4586 break;
4587 ++p;
4588 }
4589 memcpy(dest, start, p - start);
4590 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591}
Antoine Pitrouab868312009-01-10 15:40:25 +00004592
Victor Stinner785938e2011-12-11 20:09:03 +01004593PyObject *
4594PyUnicode_DecodeUTF8Stateful(const char *s,
4595 Py_ssize_t size,
4596 const char *errors,
4597 Py_ssize_t *consumed)
4598{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004599 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004600 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004601 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004602
4603 Py_ssize_t startinpos;
4604 Py_ssize_t endinpos;
4605 const char *errmsg = "";
4606 PyObject *errorHandler = NULL;
4607 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004608
4609 if (size == 0) {
4610 if (consumed)
4611 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004612 Py_INCREF(unicode_empty);
4613 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004614 }
4615
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004616 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4617 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004618 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004619 *consumed = 1;
4620 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004621 }
4622
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004623 _PyUnicodeWriter_Init(&writer, 0);
4624 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4625 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004626
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004627 writer.pos = ascii_decode(s, end, writer.data);
4628 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004629 while (s < end) {
4630 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004631 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004632 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004633 if (PyUnicode_IS_ASCII(writer.buffer))
4634 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004635 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004636 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004637 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004638 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004639 } else {
4640 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004641 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004642 }
4643
4644 switch (ch) {
4645 case 0:
4646 if (s == end || consumed)
4647 goto End;
4648 errmsg = "unexpected end of data";
4649 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004650 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004651 break;
4652 case 1:
4653 errmsg = "invalid start byte";
4654 startinpos = s - starts;
4655 endinpos = startinpos + 1;
4656 break;
4657 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004658 case 3:
4659 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004660 errmsg = "invalid continuation byte";
4661 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004662 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004663 break;
4664 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004665 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004666 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004667 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4668 writer.pos++;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004669 continue;
4670 }
4671
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004672 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004673 errors, &errorHandler,
4674 "utf-8", errmsg,
4675 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004676 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004678 }
4679
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004680End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004681 if (consumed)
4682 *consumed = s - starts;
4683
4684 Py_XDECREF(errorHandler);
4685 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004686 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004687
4688onError:
4689 Py_XDECREF(errorHandler);
4690 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004691 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004692 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004693}
4694
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s, size: the UTF-8 input and its length in bytes.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_Free() to free the memory), or NULL on memory
   allocation error.  NULL is also returned when `size` is negative or so
   large that the buffer size cannot be computed without overflow. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 wide characters are enough.  The check
       is written so that size + 1 is never computed for a value that
       could overflow, and so that both operands are signed. */
    if (size < 0 || size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
            /* Presumably a decoded character the helper could not store in
               the buffer itself (TODO confirm against stringlib/codecs.h). */
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* The value is split into a surrogate pair below, so it must
               be a non-BMP code point rather than a lone surrogate. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* Status 0 with all input consumed means decoding is done. */
            if (!ch && s == e)
                break;
            /* surrogateescape: map the offending byte to U+DC00 + byte */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004751/* Primary internal function which creates utf8 encoded bytes objects.
4752
4753 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004754 and allocate exactly as much space needed at the end. Else allocate the
4755 maximum possible needed (4 result bytes per Unicode character), and return
4756 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004757*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004758PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004759_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760{
Victor Stinner6099a032011-12-18 14:22:26 +01004761 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004762 void *data;
4763 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004765 if (!PyUnicode_Check(unicode)) {
4766 PyErr_BadArgument();
4767 return NULL;
4768 }
4769
4770 if (PyUnicode_READY(unicode) == -1)
4771 return NULL;
4772
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004773 if (PyUnicode_UTF8(unicode))
4774 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4775 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004776
4777 kind = PyUnicode_KIND(unicode);
4778 data = PyUnicode_DATA(unicode);
4779 size = PyUnicode_GET_LENGTH(unicode);
4780
Benjamin Petersonead6b532011-12-20 17:23:42 -06004781 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004782 default:
4783 assert(0);
4784 case PyUnicode_1BYTE_KIND:
4785 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4786 assert(!PyUnicode_IS_ASCII(unicode));
4787 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4788 case PyUnicode_2BYTE_KIND:
4789 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4790 case PyUnicode_4BYTE_KIND:
4791 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793}
4794
Alexander Belopolsky40018472011-02-26 01:02:56 +00004795PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004796PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4797 Py_ssize_t size,
4798 const char *errors)
4799{
4800 PyObject *v, *unicode;
4801
4802 unicode = PyUnicode_FromUnicode(s, size);
4803 if (unicode == NULL)
4804 return NULL;
4805 v = _PyUnicode_AsUTF8String(unicode, errors);
4806 Py_DECREF(unicode);
4807 return v;
4808}
4809
4810PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004811PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004813 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814}
4815
Walter Dörwald41980ca2007-08-16 21:55:45 +00004816/* --- UTF-32 Codec ------------------------------------------------------- */
4817
4818PyObject *
4819PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004820 Py_ssize_t size,
4821 const char *errors,
4822 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004823{
4824 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4825}
4826
4827PyObject *
4828PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004829 Py_ssize_t size,
4830 const char *errors,
4831 int *byteorder,
4832 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004833{
4834 const char *starts = s;
4835 Py_ssize_t startinpos;
4836 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004837 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004838 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004839 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004840 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004841 PyObject *errorHandler = NULL;
4842 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004843
Walter Dörwald41980ca2007-08-16 21:55:45 +00004844 q = (unsigned char *)s;
4845 e = q + size;
4846
4847 if (byteorder)
4848 bo = *byteorder;
4849
4850 /* Check for BOM marks (U+FEFF) in the input and adjust current
4851 byte order setting accordingly. In native mode, the leading BOM
4852 mark is skipped, in all other modes, it is copied to the output
4853 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004854 if (bo == 0 && size >= 4) {
4855 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4856 if (bom == 0x0000FEFF) {
4857 bo = -1;
4858 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004859 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004860 else if (bom == 0xFFFE0000) {
4861 bo = 1;
4862 q += 4;
4863 }
4864 if (byteorder)
4865 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004866 }
4867
Victor Stinnere64322e2012-10-30 23:12:47 +01004868 if (q == e) {
4869 if (consumed)
4870 *consumed = size;
4871 Py_INCREF(unicode_empty);
4872 return unicode_empty;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004873 }
4874
Victor Stinnere64322e2012-10-30 23:12:47 +01004875#ifdef WORDS_BIGENDIAN
4876 le = bo < 0;
4877#else
4878 le = bo <= 0;
4879#endif
4880
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004881 _PyUnicodeWriter_Init(&writer, 0);
4882 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
4883 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004884
Victor Stinnere64322e2012-10-30 23:12:47 +01004885 while (1) {
4886 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004887 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004888
Victor Stinnere64322e2012-10-30 23:12:47 +01004889 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004890 enum PyUnicode_Kind kind = writer.kind;
4891 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004892 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004893 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004894 if (le) {
4895 do {
4896 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4897 if (ch > maxch)
4898 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004899 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004900 q += 4;
4901 } while (q <= last);
4902 }
4903 else {
4904 do {
4905 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4906 if (ch > maxch)
4907 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004908 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004909 q += 4;
4910 } while (q <= last);
4911 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004912 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004913 }
4914
4915 if (ch <= maxch) {
4916 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004917 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01004918 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00004919 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01004920 startinpos = ((const char *)q) - starts;
4921 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004923 else {
4924 if (ch < 0x110000) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004925 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinnere64322e2012-10-30 23:12:47 +01004926 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004927 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4928 writer.pos++;
Victor Stinnere64322e2012-10-30 23:12:47 +01004929 q += 4;
4930 continue;
4931 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004932 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01004933 startinpos = ((const char *)q) - starts;
4934 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004935 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004936
4937 /* The remaining input chars are ignored if the callback
4938 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004939 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 errors, &errorHandler,
4941 "utf32", errmsg,
4942 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004943 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004944 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945 }
4946
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004948 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004949
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950 Py_XDECREF(errorHandler);
4951 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004952 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004953
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004955 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004956 Py_XDECREF(errorHandler);
4957 Py_XDECREF(exc);
4958 return NULL;
4959}
4960
4961PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004962_PyUnicode_EncodeUTF32(PyObject *str,
4963 const char *errors,
4964 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004965{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004966 int kind;
4967 void *data;
4968 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004969 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004970 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004971 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004972 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02004973#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00004974 int iorder[] = {0, 1, 2, 3};
4975#else
4976 int iorder[] = {3, 2, 1, 0};
4977#endif
4978
Benjamin Peterson29060642009-01-31 22:14:21 +00004979#define STORECHAR(CH) \
4980 do { \
4981 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4982 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4983 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4984 p[iorder[0]] = (CH) & 0xff; \
4985 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004986 } while(0)
4987
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004988 if (!PyUnicode_Check(str)) {
4989 PyErr_BadArgument();
4990 return NULL;
4991 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004992 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004993 return NULL;
4994 kind = PyUnicode_KIND(str);
4995 data = PyUnicode_DATA(str);
4996 len = PyUnicode_GET_LENGTH(str);
4997
4998 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004999 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005000 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005001 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005002 if (v == NULL)
5003 return NULL;
5004
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005005 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005006 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005008 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005009 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005010
5011 if (byteorder == -1) {
5012 /* force LE */
5013 iorder[0] = 0;
5014 iorder[1] = 1;
5015 iorder[2] = 2;
5016 iorder[3] = 3;
5017 }
5018 else if (byteorder == 1) {
5019 /* force BE */
5020 iorder[0] = 3;
5021 iorder[1] = 2;
5022 iorder[2] = 1;
5023 iorder[3] = 0;
5024 }
5025
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005026 for (i = 0; i < len; i++)
5027 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005028
5029 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005030 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005031#undef STORECHAR
5032}
5033
Alexander Belopolsky40018472011-02-26 01:02:56 +00005034PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005035PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5036 Py_ssize_t size,
5037 const char *errors,
5038 int byteorder)
5039{
5040 PyObject *result;
5041 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5042 if (tmp == NULL)
5043 return NULL;
5044 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5045 Py_DECREF(tmp);
5046 return result;
5047}
5048
5049PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005050PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051{
Victor Stinnerb960b342011-11-20 19:12:52 +01005052 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053}
5054
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055/* --- UTF-16 Codec ------------------------------------------------------- */
5056
Tim Peters772747b2001-08-09 22:21:55 +00005057PyObject *
5058PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005059 Py_ssize_t size,
5060 const char *errors,
5061 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062{
Walter Dörwald69652032004-09-07 20:24:22 +00005063 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5064}
5065
5066PyObject *
5067PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 Py_ssize_t size,
5069 const char *errors,
5070 int *byteorder,
5071 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005072{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005073 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005074 Py_ssize_t startinpos;
5075 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005076 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005077 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005078 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005079 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005080 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005081 PyObject *errorHandler = NULL;
5082 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083
Tim Peters772747b2001-08-09 22:21:55 +00005084 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005085 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086
5087 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005088 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005090 /* Check for BOM marks (U+FEFF) in the input and adjust current
5091 byte order setting accordingly. In native mode, the leading BOM
5092 mark is skipped, in all other modes, it is copied to the output
5093 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005094 if (bo == 0 && size >= 2) {
5095 const Py_UCS4 bom = (q[1] << 8) | q[0];
5096 if (bom == 0xFEFF) {
5097 q += 2;
5098 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005100 else if (bom == 0xFFFE) {
5101 q += 2;
5102 bo = 1;
5103 }
5104 if (byteorder)
5105 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107
Antoine Pitrou63065d72012-05-15 23:48:04 +02005108 if (q == e) {
5109 if (consumed)
5110 *consumed = size;
5111 Py_INCREF(unicode_empty);
5112 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005113 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005114
Christian Heimes743e0cd2012-10-17 23:52:17 +02005115#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005116 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005117#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005118 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005119#endif
Tim Peters772747b2001-08-09 22:21:55 +00005120
Antoine Pitrou63065d72012-05-15 23:48:04 +02005121 /* Note: size will always be longer than the resulting Unicode
5122 character count */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005123 _PyUnicodeWriter_Init(&writer, 0);
5124 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
5125 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005126
Antoine Pitrou63065d72012-05-15 23:48:04 +02005127 while (1) {
5128 Py_UCS4 ch = 0;
5129 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005130 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005131 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005132 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005133 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005134 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005135 native_ordering);
5136 else
5137 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005138 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005139 native_ordering);
5140 } else if (kind == PyUnicode_2BYTE_KIND) {
5141 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005142 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005143 native_ordering);
5144 } else {
5145 assert(kind == PyUnicode_4BYTE_KIND);
5146 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005147 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005148 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005149 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005150 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005151
Antoine Pitrou63065d72012-05-15 23:48:04 +02005152 switch (ch)
5153 {
5154 case 0:
5155 /* remaining byte at the end? (size should be even) */
5156 if (q == e || consumed)
5157 goto End;
5158 errmsg = "truncated data";
5159 startinpos = ((const char *)q) - starts;
5160 endinpos = ((const char *)e) - starts;
5161 break;
5162 /* The remaining input chars are ignored if the callback
5163 chooses to skip the input */
5164 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005165 q -= 2;
5166 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005167 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005168 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005169 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005170 endinpos = ((const char *)e) - starts;
5171 break;
5172 case 2:
5173 errmsg = "illegal encoding";
5174 startinpos = ((const char *)q) - 2 - starts;
5175 endinpos = startinpos + 2;
5176 break;
5177 case 3:
5178 errmsg = "illegal UTF-16 surrogate";
5179 startinpos = ((const char *)q) - 4 - starts;
5180 endinpos = startinpos + 2;
5181 break;
5182 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005183 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005184 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005185 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
5186 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005187 continue;
5188 }
5189
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005190 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005191 errors,
5192 &errorHandler,
5193 "utf16", errmsg,
5194 &starts,
5195 (const char **)&e,
5196 &startinpos,
5197 &endinpos,
5198 &exc,
5199 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005200 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 }
5203
Antoine Pitrou63065d72012-05-15 23:48:04 +02005204End:
Walter Dörwald69652032004-09-07 20:24:22 +00005205 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005207
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005208 Py_XDECREF(errorHandler);
5209 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005210 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005213 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 Py_XDECREF(errorHandler);
5215 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 return NULL;
5217}
5218
Tim Peters772747b2001-08-09 22:21:55 +00005219PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005220_PyUnicode_EncodeUTF16(PyObject *str,
5221 const char *errors,
5222 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005224 enum PyUnicode_Kind kind;
5225 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005226 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005227 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005228 unsigned short *out;
5229 Py_ssize_t bytesize;
5230 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005231#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005232 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005233#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005234 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005235#endif
5236
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005237 if (!PyUnicode_Check(str)) {
5238 PyErr_BadArgument();
5239 return NULL;
5240 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005241 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005242 return NULL;
5243 kind = PyUnicode_KIND(str);
5244 data = PyUnicode_DATA(str);
5245 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005246
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005247 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005248 if (kind == PyUnicode_4BYTE_KIND) {
5249 const Py_UCS4 *in = (const Py_UCS4 *)data;
5250 const Py_UCS4 *end = in + len;
5251 while (in < end)
5252 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005253 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005254 }
5255 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005256 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005257 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005258 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 if (v == NULL)
5260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005262 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005263 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005264 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005266 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005267 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005268 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005269
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005270 switch (kind) {
5271 case PyUnicode_1BYTE_KIND: {
5272 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5273 break;
Tim Peters772747b2001-08-09 22:21:55 +00005274 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005275 case PyUnicode_2BYTE_KIND: {
5276 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5277 break;
Tim Peters772747b2001-08-09 22:21:55 +00005278 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005279 case PyUnicode_4BYTE_KIND: {
5280 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5281 break;
5282 }
5283 default:
5284 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005285 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005286
5287 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005288 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289}
5290
Alexander Belopolsky40018472011-02-26 01:02:56 +00005291PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005292PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5293 Py_ssize_t size,
5294 const char *errors,
5295 int byteorder)
5296{
5297 PyObject *result;
5298 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5299 if (tmp == NULL)
5300 return NULL;
5301 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5302 Py_DECREF(tmp);
5303 return result;
5304}
5305
5306PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005307PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005309 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310}
5311
5312/* --- Unicode Escape Codec ----------------------------------------------- */
5313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005314/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5315 if all the escapes in the string make it still a valid ASCII string.
5316 Returns -1 if any escapes were found which cause the string to
5317 pop out of ASCII range. Otherwise returns the length of the
5318 required buffer to hold the string.
5319 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005320static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005321length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5322{
5323 const unsigned char *p = (const unsigned char *)s;
5324 const unsigned char *end = p + size;
5325 Py_ssize_t length = 0;
5326
5327 if (size < 0)
5328 return -1;
5329
5330 for (; p < end; ++p) {
5331 if (*p > 127) {
5332 /* Non-ASCII */
5333 return -1;
5334 }
5335 else if (*p != '\\') {
5336 /* Normal character */
5337 ++length;
5338 }
5339 else {
5340 /* Backslash-escape, check next char */
5341 ++p;
5342 /* Escape sequence reaches till end of string or
5343 non-ASCII follow-up. */
5344 if (p >= end || *p > 127)
5345 return -1;
5346 switch (*p) {
5347 case '\n':
5348 /* backslash + \n result in zero characters */
5349 break;
5350 case '\\': case '\'': case '\"':
5351 case 'b': case 'f': case 't':
5352 case 'n': case 'r': case 'v': case 'a':
5353 ++length;
5354 break;
5355 case '0': case '1': case '2': case '3':
5356 case '4': case '5': case '6': case '7':
5357 case 'x': case 'u': case 'U': case 'N':
5358 /* these do not guarantee ASCII characters */
5359 return -1;
5360 default:
5361 /* count the backslash + the other character */
5362 length += 2;
5363 }
5364 }
5365 }
5366 return length;
5367}
5368
Fredrik Lundh06d12682001-01-24 07:59:11 +00005369static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005370
Alexander Belopolsky40018472011-02-26 01:02:56 +00005371PyObject *
5372PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005373 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005374 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005376 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005377 Py_ssize_t startinpos;
5378 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005379 int j;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005380 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005382 char* message;
5383 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384 PyObject *errorHandler = NULL;
5385 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005386 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005387
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005388 len = length_of_escaped_ascii_string(s, size);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005389 if (len == 0) {
5390 Py_INCREF(unicode_empty);
5391 return unicode_empty;
5392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005393
5394 /* After length_of_escaped_ascii_string() there are two alternatives,
5395 either the string is pure ASCII with named escapes like \n, etc.
5396 and we determined it's exact size (common case)
5397 or it contains \x, \u, ... escape sequences. then we create a
5398 legacy wchar string and resize it at the end of this function. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005399 _PyUnicodeWriter_Init(&writer, 0);
5400 if (len > 0) {
5401 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005402 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005403 assert(writer.kind == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005404 }
5405 else {
5406 /* Escaped strings will always be longer than the resulting
5407 Unicode string, so we start with size here and then reduce the
5408 length after conversion to the true value.
5409 (but if the error callback returns a long replacement string
5410 we'll have to allocate more space) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005411 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005412 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005413 }
5414
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005416 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005418
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 while (s < end) {
5420 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005421 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005422 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423
5424 /* Non-escape characters are interpreted as Unicode ordinals */
5425 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005426 x = (unsigned char)*s;
5427 s++;
5428 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005429 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005430 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5431 writer.pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 continue;
5433 }
5434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005435 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 /* \ - Escapes */
5437 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005438 c = *s++;
5439 if (s > end)
5440 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005441
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005442 /* The only case in which i == ascii_length is a backslash
5443 followed by a newline. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005444 assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005445
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005446 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005449#define WRITECHAR(ch) \
5450 do { \
5451 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) \
5452 goto onError; \
5453 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); \
5454 writer.pos++; \
5455 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005456
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005458 case '\\': WRITECHAR('\\'); break;
5459 case '\'': WRITECHAR('\''); break;
5460 case '\"': WRITECHAR('\"'); break;
5461 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005462 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005463 case 'f': WRITECHAR('\014'); break;
5464 case 't': WRITECHAR('\t'); break;
5465 case 'n': WRITECHAR('\n'); break;
5466 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005467 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005468 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005469 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005470 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 case '0': case '1': case '2': case '3':
5474 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005475 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005476 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005477 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005478 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005479 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005481 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 break;
5483
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 /* hex escapes */
5485 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005487 digits = 2;
5488 message = "truncated \\xXX escape";
5489 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005493 digits = 4;
5494 message = "truncated \\uXXXX escape";
5495 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005498 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005499 digits = 8;
5500 message = "truncated \\UXXXXXXXX escape";
5501 hexescape:
5502 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005503 if (s+digits>end) {
5504 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005505 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 errors, &errorHandler,
5507 "unicodeescape", "end of string in escape sequence",
5508 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005509 &writer))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510 goto onError;
5511 goto nextByte;
5512 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005513 for (j = 0; j < digits; ++j) {
5514 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005515 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005516 endinpos = (s+j+1)-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005517 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 errors, &errorHandler,
5519 "unicodeescape", message,
5520 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005521 &writer))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005522 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005523 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005524 }
5525 chr = (chr<<4) & ~0xF;
5526 if (c >= '0' && c <= '9')
5527 chr += c - '0';
5528 else if (c >= 'a' && c <= 'f')
5529 chr += 10 + c - 'a';
5530 else
5531 chr += 10 + c - 'A';
5532 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005533 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005534 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005535 /* _decoding_error will have already written into the
5536 target buffer. */
5537 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005538 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005539 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005540 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005541 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005542 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005543 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005544 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005545 errors, &errorHandler,
5546 "unicodeescape", "illegal Unicode character",
5547 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005548 &writer))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005549 goto onError;
5550 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005551 break;
5552
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005554 case 'N':
5555 message = "malformed \\N character escape";
5556 if (ucnhash_CAPI == NULL) {
5557 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005558 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5559 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005560 if (ucnhash_CAPI == NULL)
5561 goto ucnhashError;
5562 }
5563 if (*s == '{') {
5564 const char *start = s+1;
5565 /* look for the closing brace */
5566 while (*s != '}' && s < end)
5567 s++;
5568 if (s > start && s < end && *s == '}') {
5569 /* found a name. look it up in the unicode database */
5570 message = "unknown Unicode character name";
5571 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005572 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005573 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005574 goto store;
5575 }
5576 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005577 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005578 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 errors, &errorHandler,
5580 "unicodeescape", message,
5581 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005582 &writer))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005583 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005584 break;
5585
5586 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005587 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005588 message = "\\ at end of string";
5589 s--;
5590 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005591 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 errors, &errorHandler,
5593 "unicodeescape", message,
5594 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005595 &writer))
Walter Dörwald8c077222002-03-25 11:16:18 +00005596 goto onError;
5597 }
5598 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005599 WRITECHAR('\\');
5600 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005601 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005602 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005605 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005607#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005608
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005609 Py_XDECREF(errorHandler);
5610 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005611 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005612
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005614 PyErr_SetString(
5615 PyExc_UnicodeError,
5616 "\\N escapes not supported (can't load unicodedata module)"
5617 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005618 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 Py_XDECREF(errorHandler);
5620 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005621 return NULL;
5622
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005624 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 Py_XDECREF(errorHandler);
5626 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 return NULL;
5628}
5629
/* Return a Unicode-Escape encoded bytes object for the Unicode object.

   The result contains only the escaped characters; no enclosing quotes
   are added.

*/
5636
Alexander Belopolsky40018472011-02-26 01:02:56 +00005637PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005638PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005640 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005641 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005643 int kind;
5644 void *data;
5645 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646
Ezio Melottie7f90372012-10-05 03:33:31 +03005647 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005648 escape.
5649
Ezio Melottie7f90372012-10-05 03:33:31 +03005650 For UCS1 strings it's '\xxx', 4 bytes per source character.
5651 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5652 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005653 */
5654
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005655 if (!PyUnicode_Check(unicode)) {
5656 PyErr_BadArgument();
5657 return NULL;
5658 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005659 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005660 return NULL;
5661 len = PyUnicode_GET_LENGTH(unicode);
5662 kind = PyUnicode_KIND(unicode);
5663 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005664 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005665 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5666 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5667 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5668 }
5669
5670 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005671 return PyBytes_FromStringAndSize(NULL, 0);
5672
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005673 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005675
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005676 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005678 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 if (repr == NULL)
5681 return NULL;
5682
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005683 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005685 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005686 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005687
Walter Dörwald79e913e2007-05-12 11:08:06 +00005688 /* Escape backslashes */
5689 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 *p++ = '\\';
5691 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005692 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005693 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005694
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005695 /* Map 21-bit characters to '\U00xxxxxx' */
5696 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005697 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005698 *p++ = '\\';
5699 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005700 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5701 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5702 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5703 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5704 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5705 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5706 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5707 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005709 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005710
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005712 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 *p++ = '\\';
5714 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005715 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5716 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5717 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5718 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005720
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005721 /* Map special whitespace to '\t', \n', '\r' */
5722 else if (ch == '\t') {
5723 *p++ = '\\';
5724 *p++ = 't';
5725 }
5726 else if (ch == '\n') {
5727 *p++ = '\\';
5728 *p++ = 'n';
5729 }
5730 else if (ch == '\r') {
5731 *p++ = '\\';
5732 *p++ = 'r';
5733 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005734
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005735 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005736 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005738 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005739 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5740 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005741 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005742
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 /* Copy everything else as-is */
5744 else
5745 *p++ = (char) ch;
5746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005748 assert(p - PyBytes_AS_STRING(repr) > 0);
5749 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5750 return NULL;
5751 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752}
5753
Alexander Belopolsky40018472011-02-26 01:02:56 +00005754PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5756 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005758 PyObject *result;
5759 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5760 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005762 result = PyUnicode_AsUnicodeEscapeString(tmp);
5763 Py_DECREF(tmp);
5764 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765}
5766
5767/* --- Raw Unicode Escape Codec ------------------------------------------- */
5768
Alexander Belopolsky40018472011-02-26 01:02:56 +00005769PyObject *
5770PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005771 Py_ssize_t size,
5772 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005775 Py_ssize_t startinpos;
5776 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005777 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 const char *end;
5779 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 PyObject *errorHandler = NULL;
5781 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005782
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005783 if (size == 0) {
5784 Py_INCREF(unicode_empty);
5785 return unicode_empty;
5786 }
5787
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 /* Escaped strings will always be longer than the resulting
5789 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 length after conversion to the true value. (But decoding error
5791 handler might have to resize the string) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005792 _PyUnicodeWriter_Init(&writer, 1);
5793 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005795
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 end = s + size;
5797 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 unsigned char c;
5799 Py_UCS4 x;
5800 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005801 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 /* Non-escape characters are interpreted as Unicode ordinals */
5804 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005805 x = (unsigned char)*s++;
5806 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005807 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005808 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5809 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005811 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 startinpos = s-starts;
5813
5814 /* \u-escapes are only interpreted iff the number of leading
5815 backslashes if odd */
5816 bs = s;
5817 for (;s < end;) {
5818 if (*s != '\\')
5819 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005820 x = (unsigned char)*s++;
5821 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005822 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005823 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5824 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 }
5826 if (((s - bs) & 1) == 0 ||
5827 s >= end ||
5828 (*s != 'u' && *s != 'U')) {
5829 continue;
5830 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005831 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 count = *s=='u' ? 4 : 8;
5833 s++;
5834
5835 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 for (x = 0, i = 0; i < count; ++i, ++s) {
5837 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005838 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005840 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 errors, &errorHandler,
5842 "rawunicodeescape", "truncated \\uXXXX",
5843 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005844 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 goto onError;
5846 goto nextByte;
5847 }
5848 x = (x<<4) & ~0xF;
5849 if (c >= '0' && c <= '9')
5850 x += c - '0';
5851 else if (c >= 'a' && c <= 'f')
5852 x += 10 + c - 'a';
5853 else
5854 x += 10 + c - 'A';
5855 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005856 if (x <= MAX_UNICODE) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005857 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005858 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005859 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5860 writer.pos++;
5861 }
5862 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005863 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005864 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005865 errors, &errorHandler,
5866 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005868 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005870 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 nextByte:
5872 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 Py_XDECREF(errorHandler);
5875 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005876 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005877
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005879 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 Py_XDECREF(errorHandler);
5881 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 return NULL;
5883}
5884
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005885
Alexander Belopolsky40018472011-02-26 01:02:56 +00005886PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005887PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005889 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 char *p;
5891 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 Py_ssize_t expandsize, pos;
5893 int kind;
5894 void *data;
5895 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005897 if (!PyUnicode_Check(unicode)) {
5898 PyErr_BadArgument();
5899 return NULL;
5900 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005901 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902 return NULL;
5903 kind = PyUnicode_KIND(unicode);
5904 data = PyUnicode_DATA(unicode);
5905 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005906 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5907 bytes, and 1 byte characters 4. */
5908 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005909
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005910 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005911 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005912
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005913 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 if (repr == NULL)
5915 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005916 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005917 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005919 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005920 for (pos = 0; pos < len; pos++) {
5921 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 /* Map 32-bit characters to '\Uxxxxxxxx' */
5923 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005924 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005925 *p++ = '\\';
5926 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005927 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5928 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5929 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5930 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5931 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5932 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5933 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5934 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005935 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005937 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 *p++ = '\\';
5939 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005940 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5941 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5942 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5943 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 /* Copy everything else as-is */
5946 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 *p++ = (char) ch;
5948 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005949
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005950 assert(p > q);
5951 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005952 return NULL;
5953 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954}
5955
Alexander Belopolsky40018472011-02-26 01:02:56 +00005956PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005957PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5958 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005960 PyObject *result;
5961 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5962 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005963 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005964 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5965 Py_DECREF(tmp);
5966 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967}
5968
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005969/* --- Unicode Internal Codec ------------------------------------------- */
5970
Alexander Belopolsky40018472011-02-26 01:02:56 +00005971PyObject *
5972_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005973 Py_ssize_t size,
5974 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005975{
5976 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005977 Py_ssize_t startinpos;
5978 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005979 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005980 const char *end;
5981 const char *reason;
5982 PyObject *errorHandler = NULL;
5983 PyObject *exc = NULL;
5984
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005985 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005986 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005987 1))
5988 return NULL;
5989
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005990 if (size == 0) {
5991 Py_INCREF(unicode_empty);
5992 return unicode_empty;
5993 }
5994
Thomas Wouters89f507f2006-12-13 04:49:30 +00005995 /* XXX overflow detection missing */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005996 _PyUnicodeWriter_Init(&writer, 0);
5997 if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005999 end = s + size;
6000
6001 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006002 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006003 Py_UCS4 ch;
6004 /* We copy the raw representation one byte at a time because the
6005 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006006 ((char *) &uch)[0] = s[0];
6007 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006008#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006009 ((char *) &uch)[2] = s[2];
6010 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006011#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006012 ch = uch;
6013
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006014 /* We have to sanity check the raw data, otherwise doom looms for
6015 some malformed UCS-4 data. */
6016 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006017#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006018 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006019#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006020 end-s < Py_UNICODE_SIZE
6021 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006023 startinpos = s - starts;
6024 if (end-s < Py_UNICODE_SIZE) {
6025 endinpos = end-starts;
6026 reason = "truncated input";
6027 }
6028 else {
6029 endinpos = s - starts + Py_UNICODE_SIZE;
6030 reason = "illegal code point (> 0x10FFFF)";
6031 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006032 if (unicode_decode_call_errorhandler_writer(
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006033 errors, &errorHandler,
6034 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006035 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006036 &writer))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006037 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006038 continue;
6039 }
6040
6041 s += Py_UNICODE_SIZE;
6042#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006043 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006044 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006045 Py_UNICODE uch2;
6046 ((char *) &uch2)[0] = s[0];
6047 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006048 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006049 {
Victor Stinner551ac952011-11-29 22:58:13 +01006050 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006051 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006052 }
6053 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006054#endif
6055
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006056 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006057 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006058 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
6059 writer.pos++;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006060 }
6061
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006062 Py_XDECREF(errorHandler);
6063 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006064 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006065
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006067 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006068 Py_XDECREF(errorHandler);
6069 Py_XDECREF(exc);
6070 return NULL;
6071}
6072
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073/* --- Latin-1 Codec ------------------------------------------------------ */
6074
Alexander Belopolsky40018472011-02-26 01:02:56 +00006075PyObject *
6076PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006077 Py_ssize_t size,
6078 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006081 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082}
6083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006084/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006085static void
6086make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006087 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006088 PyObject *unicode,
6089 Py_ssize_t startpos, Py_ssize_t endpos,
6090 const char *reason)
6091{
6092 if (*exceptionObject == NULL) {
6093 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006094 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006095 encoding, unicode, startpos, endpos, reason);
6096 }
6097 else {
6098 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6099 goto onError;
6100 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6101 goto onError;
6102 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6103 goto onError;
6104 return;
6105 onError:
6106 Py_DECREF(*exceptionObject);
6107 *exceptionObject = NULL;
6108 }
6109}
6110
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006112static void
6113raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006114 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006115 PyObject *unicode,
6116 Py_ssize_t startpos, Py_ssize_t endpos,
6117 const char *reason)
6118{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006119 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006120 encoding, unicode, startpos, endpos, reason);
6121 if (*exceptionObject != NULL)
6122 PyCodec_StrictErrors(*exceptionObject);
6123}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006124
6125/* error handling callback helper:
6126 build arguments, call the callback and check the arguments,
6127 put the result into newpos and return the replacement string, which
6128 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006129static PyObject *
6130unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006131 PyObject **errorHandler,
6132 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006133 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006134 Py_ssize_t startpos, Py_ssize_t endpos,
6135 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006137 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006138 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139 PyObject *restuple;
6140 PyObject *resunicode;
6141
6142 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006144 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006146 }
6147
Benjamin Petersonbac79492012-01-14 13:34:47 -05006148 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006149 return NULL;
6150 len = PyUnicode_GET_LENGTH(unicode);
6151
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006152 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006153 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006156
6157 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006159 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006161 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006162 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 Py_DECREF(restuple);
6164 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006165 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006166 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 &resunicode, newpos)) {
6168 Py_DECREF(restuple);
6169 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006170 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006171 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6172 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6173 Py_DECREF(restuple);
6174 return NULL;
6175 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006177 *newpos = len + *newpos;
6178 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6180 Py_DECREF(restuple);
6181 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006182 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006183 Py_INCREF(resunicode);
6184 Py_DECREF(restuple);
6185 return resunicode;
6186}
6187
Alexander Belopolsky40018472011-02-26 01:02:56 +00006188static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006189unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006190 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006191 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006193 /* input state */
6194 Py_ssize_t pos=0, size;
6195 int kind;
6196 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006197 /* output object */
6198 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006199 /* pointer into the output */
6200 char *str;
6201 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006202 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006203 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6204 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006205 PyObject *errorHandler = NULL;
6206 PyObject *exc = NULL;
6207 /* the following variable is used for caching string comparisons
6208 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6209 int known_errorHandler = -1;
6210
Benjamin Petersonbac79492012-01-14 13:34:47 -05006211 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006212 return NULL;
6213 size = PyUnicode_GET_LENGTH(unicode);
6214 kind = PyUnicode_KIND(unicode);
6215 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006216 /* allocate enough for a simple encoding without
6217 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006218 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006219 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006220 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006221 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006222 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006223 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006224 ressize = size;
6225
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006226 while (pos < size) {
6227 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006228
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 /* can we encode this? */
6230 if (c<limit) {
6231 /* no overflow check, because we know that the space is enough */
6232 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006233 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006234 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 Py_ssize_t requiredsize;
6237 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006238 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006240 Py_ssize_t collstart = pos;
6241 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006243 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 ++collend;
6245 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6246 if (known_errorHandler==-1) {
6247 if ((errors==NULL) || (!strcmp(errors, "strict")))
6248 known_errorHandler = 1;
6249 else if (!strcmp(errors, "replace"))
6250 known_errorHandler = 2;
6251 else if (!strcmp(errors, "ignore"))
6252 known_errorHandler = 3;
6253 else if (!strcmp(errors, "xmlcharrefreplace"))
6254 known_errorHandler = 4;
6255 else
6256 known_errorHandler = 0;
6257 }
6258 switch (known_errorHandler) {
6259 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006260 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 goto onError;
6262 case 2: /* replace */
6263 while (collstart++<collend)
6264 *str++ = '?'; /* fall through */
6265 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006266 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 break;
6268 case 4: /* xmlcharrefreplace */
6269 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006270 /* determine replacement size */
6271 for (i = collstart, repsize = 0; i < collend; ++i) {
6272 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6273 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006275 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006277 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006279 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006281 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006283 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006285 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006286 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006288 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006290 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 if (requiredsize > ressize) {
6292 if (requiredsize<2*ressize)
6293 requiredsize = 2*ressize;
6294 if (_PyBytes_Resize(&res, requiredsize))
6295 goto onError;
6296 str = PyBytes_AS_STRING(res) + respos;
6297 ressize = requiredsize;
6298 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006299 /* generate replacement */
6300 for (i = collstart; i < collend; ++i) {
6301 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006303 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006304 break;
6305 default:
6306 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006307 encoding, reason, unicode, &exc,
6308 collstart, collend, &newpos);
6309 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006310 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006312 if (PyBytes_Check(repunicode)) {
6313 /* Directly copy bytes result to output. */
6314 repsize = PyBytes_Size(repunicode);
6315 if (repsize > 1) {
6316 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006317 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006318 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6319 Py_DECREF(repunicode);
6320 goto onError;
6321 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006322 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006323 ressize += repsize-1;
6324 }
6325 memcpy(str, PyBytes_AsString(repunicode), repsize);
6326 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006327 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006328 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006329 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006330 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 /* need more space? (at least enough for what we
6332 have+the replacement+the rest of the string, so
6333 we won't have to check space for encodable characters) */
6334 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006335 repsize = PyUnicode_GET_LENGTH(repunicode);
6336 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 if (requiredsize > ressize) {
6338 if (requiredsize<2*ressize)
6339 requiredsize = 2*ressize;
6340 if (_PyBytes_Resize(&res, requiredsize)) {
6341 Py_DECREF(repunicode);
6342 goto onError;
6343 }
6344 str = PyBytes_AS_STRING(res) + respos;
6345 ressize = requiredsize;
6346 }
6347 /* check if there is anything unencodable in the replacement
6348 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006349 for (i = 0; repsize-->0; ++i, ++str) {
6350 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006352 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006353 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 Py_DECREF(repunicode);
6355 goto onError;
6356 }
6357 *str = (char)c;
6358 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006359 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006360 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006361 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006362 }
6363 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006364 /* Resize if we allocated to much */
6365 size = str - PyBytes_AS_STRING(res);
6366 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006367 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006368 if (_PyBytes_Resize(&res, size) < 0)
6369 goto onError;
6370 }
6371
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006372 Py_XDECREF(errorHandler);
6373 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006374 return res;
6375
6376 onError:
6377 Py_XDECREF(res);
6378 Py_XDECREF(errorHandler);
6379 Py_XDECREF(exc);
6380 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006381}
6382
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006383/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006384PyObject *
6385PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006386 Py_ssize_t size,
6387 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006389 PyObject *result;
6390 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6391 if (unicode == NULL)
6392 return NULL;
6393 result = unicode_encode_ucs1(unicode, errors, 256);
6394 Py_DECREF(unicode);
6395 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396}
6397
Alexander Belopolsky40018472011-02-26 01:02:56 +00006398PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006399_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400{
6401 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 PyErr_BadArgument();
6403 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006405 if (PyUnicode_READY(unicode) == -1)
6406 return NULL;
6407 /* Fast path: if it is a one-byte string, construct
6408 bytes object directly. */
6409 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6410 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6411 PyUnicode_GET_LENGTH(unicode));
6412 /* Non-Latin-1 characters present. Defer to above function to
6413 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006414 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006415}
6416
6417PyObject*
6418PyUnicode_AsLatin1String(PyObject *unicode)
6419{
6420 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421}
6422
6423/* --- 7-bit ASCII Codec -------------------------------------------------- */
6424
Alexander Belopolsky40018472011-02-26 01:02:56 +00006425PyObject *
6426PyUnicode_DecodeASCII(const char *s,
6427 Py_ssize_t size,
6428 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006431 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006432 int kind;
6433 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006434 Py_ssize_t startinpos;
6435 Py_ssize_t endinpos;
6436 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 const char *e;
6438 PyObject *errorHandler = NULL;
6439 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006440
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006441 if (size == 0) {
6442 Py_INCREF(unicode_empty);
6443 return unicode_empty;
6444 }
6445
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006447 if (size == 1 && (unsigned char)s[0] < 128)
6448 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006449
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006450 _PyUnicodeWriter_Init(&writer, 0);
6451 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006453
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006454 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006455 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006456 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006457 writer.pos = outpos;
6458 if (writer.pos == size)
6459 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006460
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006461 s += writer.pos;
6462 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 register unsigned char c = (unsigned char)*s;
6465 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006466 PyUnicode_WRITE(kind, data, writer.pos, c);
6467 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 ++s;
6469 }
6470 else {
6471 startinpos = s-starts;
6472 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006473 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 errors, &errorHandler,
6475 "ascii", "ordinal not in range(128)",
6476 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006477 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006479 kind = writer.kind;
6480 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 Py_XDECREF(errorHandler);
6484 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006485 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006486
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006488 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006489 Py_XDECREF(errorHandler);
6490 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 return NULL;
6492}
6493
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006494/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006495PyObject *
6496PyUnicode_EncodeASCII(const Py_UNICODE *p,
6497 Py_ssize_t size,
6498 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 PyObject *result;
6501 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6502 if (unicode == NULL)
6503 return NULL;
6504 result = unicode_encode_ucs1(unicode, errors, 128);
6505 Py_DECREF(unicode);
6506 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507}
6508
Alexander Belopolsky40018472011-02-26 01:02:56 +00006509PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006510_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511{
6512 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 PyErr_BadArgument();
6514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006516 if (PyUnicode_READY(unicode) == -1)
6517 return NULL;
6518 /* Fast path: if it is an ASCII-only string, construct bytes object
6519 directly. Else defer to above function to raise the exception. */
6520 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6521 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6522 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006523 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006524}
6525
6526PyObject *
6527PyUnicode_AsASCIIString(PyObject *unicode)
6528{
6529 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530}
6531
Victor Stinner99b95382011-07-04 14:23:54 +02006532#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006533
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006534/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006535
/* When int is narrower than size_t a buffer can hold more than INT_MAX
   bytes; NEED_RETRY enables the code that feeds such input to the code page
   functions in chunks of at most INT_MAX. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif

/* Fallback definition, used only when the included headers do not already
   provide WC_ERR_INVALID_CHARS. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6543
6544static char*
6545code_page_name(UINT code_page, PyObject **obj)
6546{
6547 *obj = NULL;
6548 if (code_page == CP_ACP)
6549 return "mbcs";
6550 if (code_page == CP_UTF7)
6551 return "CP_UTF7";
6552 if (code_page == CP_UTF8)
6553 return "CP_UTF8";
6554
6555 *obj = PyBytes_FromFormat("cp%u", code_page);
6556 if (*obj == NULL)
6557 return NULL;
6558 return PyBytes_AS_STRING(*obj);
6559}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006560
Alexander Belopolsky40018472011-02-26 01:02:56 +00006561static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006562is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006563{
6564 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006565 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006566
Victor Stinner3a50e702011-10-18 21:21:00 +02006567 if (!IsDBCSLeadByteEx(code_page, *curr))
6568 return 0;
6569
6570 prev = CharPrevExA(code_page, s, curr, 0);
6571 if (prev == curr)
6572 return 1;
6573 /* FIXME: This code is limited to "true" double-byte encodings,
6574 as it assumes an incomplete character consists of a single
6575 byte. */
6576 if (curr - prev == 2)
6577 return 1;
6578 if (!IsDBCSLeadByteEx(code_page, *prev))
6579 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006580 return 0;
6581}
6582
Victor Stinner3a50e702011-10-18 21:21:00 +02006583static DWORD
6584decode_code_page_flags(UINT code_page)
6585{
6586 if (code_page == CP_UTF7) {
6587 /* The CP_UTF7 decoder only supports flags=0 */
6588 return 0;
6589 }
6590 else
6591 return MB_ERR_INVALID_CHARS;
6592}
6593
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006594/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006595 * Decode a byte string from a Windows code page into unicode object in strict
6596 * mode.
6597 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006598 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6599 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006600 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006601static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006602decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006603 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006604 const char *in,
6605 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006606{
Victor Stinner3a50e702011-10-18 21:21:00 +02006607 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006608 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006609 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006610
6611 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006612 assert(insize > 0);
6613 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6614 if (outsize <= 0)
6615 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006616
6617 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006619 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006620 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 if (*v == NULL)
6622 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006623 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006624 }
6625 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006626 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006627 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006628 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006630 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006631 }
6632
6633 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006634 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6635 if (outsize <= 0)
6636 goto error;
6637 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006638
Victor Stinner3a50e702011-10-18 21:21:00 +02006639error:
6640 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6641 return -2;
6642 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006643 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006644}
6645
Victor Stinner3a50e702011-10-18 21:21:00 +02006646/*
6647 * Decode a byte string from a code page into unicode object with an error
6648 * handler.
6649 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006650 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006651 * UnicodeDecodeError exception and returns -1 on error.
6652 */
6653static int
6654decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006655 PyObject **v,
6656 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006657 const char *errors)
6658{
6659 const char *startin = in;
6660 const char *endin = in + size;
6661 const DWORD flags = decode_code_page_flags(code_page);
6662 /* Ideally, we should get reason from FormatMessage. This is the Windows
6663 2000 English version of the message. */
6664 const char *reason = "No mapping for the Unicode character exists "
6665 "in the target code page.";
6666 /* each step cannot decode more than 1 character, but a character can be
6667 represented as a surrogate pair */
6668 wchar_t buffer[2], *startout, *out;
6669 int insize, outsize;
6670 PyObject *errorHandler = NULL;
6671 PyObject *exc = NULL;
6672 PyObject *encoding_obj = NULL;
6673 char *encoding;
6674 DWORD err;
6675 int ret = -1;
6676
6677 assert(size > 0);
6678
6679 encoding = code_page_name(code_page, &encoding_obj);
6680 if (encoding == NULL)
6681 return -1;
6682
6683 if (errors == NULL || strcmp(errors, "strict") == 0) {
6684 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6685 UnicodeDecodeError. */
6686 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6687 if (exc != NULL) {
6688 PyCodec_StrictErrors(exc);
6689 Py_CLEAR(exc);
6690 }
6691 goto error;
6692 }
6693
6694 if (*v == NULL) {
6695 /* Create unicode object */
6696 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6697 PyErr_NoMemory();
6698 goto error;
6699 }
Victor Stinnerab595942011-12-17 04:59:06 +01006700 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006701 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006702 if (*v == NULL)
6703 goto error;
6704 startout = PyUnicode_AS_UNICODE(*v);
6705 }
6706 else {
6707 /* Extend unicode object */
6708 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6709 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6710 PyErr_NoMemory();
6711 goto error;
6712 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006713 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006714 goto error;
6715 startout = PyUnicode_AS_UNICODE(*v) + n;
6716 }
6717
6718 /* Decode the byte string character per character */
6719 out = startout;
6720 while (in < endin)
6721 {
6722 /* Decode a character */
6723 insize = 1;
6724 do
6725 {
6726 outsize = MultiByteToWideChar(code_page, flags,
6727 in, insize,
6728 buffer, Py_ARRAY_LENGTH(buffer));
6729 if (outsize > 0)
6730 break;
6731 err = GetLastError();
6732 if (err != ERROR_NO_UNICODE_TRANSLATION
6733 && err != ERROR_INSUFFICIENT_BUFFER)
6734 {
6735 PyErr_SetFromWindowsErr(0);
6736 goto error;
6737 }
6738 insize++;
6739 }
6740 /* 4=maximum length of a UTF-8 sequence */
6741 while (insize <= 4 && (in + insize) <= endin);
6742
6743 if (outsize <= 0) {
6744 Py_ssize_t startinpos, endinpos, outpos;
6745
6746 startinpos = in - startin;
6747 endinpos = startinpos + 1;
6748 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006749 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006750 errors, &errorHandler,
6751 encoding, reason,
6752 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006753 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006754 {
6755 goto error;
6756 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006757 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006758 }
6759 else {
6760 in += insize;
6761 memcpy(out, buffer, outsize * sizeof(wchar_t));
6762 out += outsize;
6763 }
6764 }
6765
6766 /* write a NUL character at the end */
6767 *out = 0;
6768
6769 /* Extend unicode object */
6770 outsize = out - startout;
6771 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006772 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006773 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006774 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006775
6776error:
6777 Py_XDECREF(encoding_obj);
6778 Py_XDECREF(errorHandler);
6779 Py_XDECREF(exc);
6780 return ret;
6781}
6782
Victor Stinner3a50e702011-10-18 21:21:00 +02006783static PyObject *
6784decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006785 const char *s, Py_ssize_t size,
6786 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006787{
Victor Stinner76a31a62011-11-04 00:05:13 +01006788 PyObject *v = NULL;
6789 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006790
Victor Stinner3a50e702011-10-18 21:21:00 +02006791 if (code_page < 0) {
6792 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6793 return NULL;
6794 }
6795
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006796 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006798
Victor Stinner76a31a62011-11-04 00:05:13 +01006799 do
6800 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006801#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006802 if (size > INT_MAX) {
6803 chunk_size = INT_MAX;
6804 final = 0;
6805 done = 0;
6806 }
6807 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006808#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006809 {
6810 chunk_size = (int)size;
6811 final = (consumed == NULL);
6812 done = 1;
6813 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006814
Victor Stinner76a31a62011-11-04 00:05:13 +01006815 /* Skip trailing lead-byte unless 'final' is set */
6816 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6817 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006818
Victor Stinner76a31a62011-11-04 00:05:13 +01006819 if (chunk_size == 0 && done) {
6820 if (v != NULL)
6821 break;
6822 Py_INCREF(unicode_empty);
6823 return unicode_empty;
6824 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006825
Victor Stinner76a31a62011-11-04 00:05:13 +01006826
6827 converted = decode_code_page_strict(code_page, &v,
6828 s, chunk_size);
6829 if (converted == -2)
6830 converted = decode_code_page_errors(code_page, &v,
6831 s, chunk_size,
6832 errors);
6833 assert(converted != 0);
6834
6835 if (converted < 0) {
6836 Py_XDECREF(v);
6837 return NULL;
6838 }
6839
6840 if (consumed)
6841 *consumed += converted;
6842
6843 s += converted;
6844 size -= converted;
6845 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006846
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006847 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006848}
6849
Alexander Belopolsky40018472011-02-26 01:02:56 +00006850PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006851PyUnicode_DecodeCodePageStateful(int code_page,
6852 const char *s,
6853 Py_ssize_t size,
6854 const char *errors,
6855 Py_ssize_t *consumed)
6856{
6857 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6858}
6859
6860PyObject *
6861PyUnicode_DecodeMBCSStateful(const char *s,
6862 Py_ssize_t size,
6863 const char *errors,
6864 Py_ssize_t *consumed)
6865{
6866 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6867}
6868
6869PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006870PyUnicode_DecodeMBCS(const char *s,
6871 Py_ssize_t size,
6872 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006873{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006874 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6875}
6876
Victor Stinner3a50e702011-10-18 21:21:00 +02006877static DWORD
6878encode_code_page_flags(UINT code_page, const char *errors)
6879{
6880 if (code_page == CP_UTF8) {
6881 if (winver.dwMajorVersion >= 6)
6882 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6883 and later */
6884 return WC_ERR_INVALID_CHARS;
6885 else
6886 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6887 return 0;
6888 }
6889 else if (code_page == CP_UTF7) {
6890 /* CP_UTF7 only supports flags=0 */
6891 return 0;
6892 }
6893 else {
6894 if (errors != NULL && strcmp(errors, "replace") == 0)
6895 return 0;
6896 else
6897 return WC_NO_BEST_FIT_CHARS;
6898 }
6899}
6900
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006901/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006902 * Encode a Unicode string to a Windows code page into a byte string in strict
6903 * mode.
6904 *
6905 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006906 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006907 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006908static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006909encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006910 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006911 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006912{
Victor Stinner554f3f02010-06-16 23:33:54 +00006913 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006914 BOOL *pusedDefaultChar = &usedDefaultChar;
6915 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006916 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006917 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006918 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006919 const DWORD flags = encode_code_page_flags(code_page, NULL);
6920 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006921 /* Create a substring so that we can get the UTF-16 representation
6922 of just the slice under consideration. */
6923 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006924
Martin v. Löwis3d325192011-11-04 18:23:06 +01006925 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006926
Victor Stinner3a50e702011-10-18 21:21:00 +02006927 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006928 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006929 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006930 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006931
Victor Stinner2fc507f2011-11-04 20:06:39 +01006932 substring = PyUnicode_Substring(unicode, offset, offset+len);
6933 if (substring == NULL)
6934 return -1;
6935 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6936 if (p == NULL) {
6937 Py_DECREF(substring);
6938 return -1;
6939 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006940
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006941 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006942 outsize = WideCharToMultiByte(code_page, flags,
6943 p, size,
6944 NULL, 0,
6945 NULL, pusedDefaultChar);
6946 if (outsize <= 0)
6947 goto error;
6948 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006949 if (pusedDefaultChar && *pusedDefaultChar) {
6950 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006951 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006952 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006953
Victor Stinner3a50e702011-10-18 21:21:00 +02006954 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006955 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006956 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006957 if (*outbytes == NULL) {
6958 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006960 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006961 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006962 }
6963 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006964 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006965 const Py_ssize_t n = PyBytes_Size(*outbytes);
6966 if (outsize > PY_SSIZE_T_MAX - n) {
6967 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006968 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006970 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006971 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6972 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006973 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006974 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006975 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006976 }
6977
6978 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006979 outsize = WideCharToMultiByte(code_page, flags,
6980 p, size,
6981 out, outsize,
6982 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006983 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006984 if (outsize <= 0)
6985 goto error;
6986 if (pusedDefaultChar && *pusedDefaultChar)
6987 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006988 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006989
Victor Stinner3a50e702011-10-18 21:21:00 +02006990error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006991 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006992 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6993 return -2;
6994 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006995 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006996}
6997
Victor Stinner3a50e702011-10-18 21:21:00 +02006998/*
6999 * Encode a Unicode string to a Windows code page into a byte string using an
7000 * error handler.
7001 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007002 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007003 * -1 on other error.
7004 */
7005static int
7006encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007007 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007008 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007009{
Victor Stinner3a50e702011-10-18 21:21:00 +02007010 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007011 Py_ssize_t pos = unicode_offset;
7012 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007013 /* Ideally, we should get reason from FormatMessage. This is the Windows
7014 2000 English version of the message. */
7015 const char *reason = "invalid character";
7016 /* 4=maximum length of a UTF-8 sequence */
7017 char buffer[4];
7018 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7019 Py_ssize_t outsize;
7020 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007021 PyObject *errorHandler = NULL;
7022 PyObject *exc = NULL;
7023 PyObject *encoding_obj = NULL;
7024 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007025 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007026 PyObject *rep;
7027 int ret = -1;
7028
7029 assert(insize > 0);
7030
7031 encoding = code_page_name(code_page, &encoding_obj);
7032 if (encoding == NULL)
7033 return -1;
7034
7035 if (errors == NULL || strcmp(errors, "strict") == 0) {
7036 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7037 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007038 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007039 if (exc != NULL) {
7040 PyCodec_StrictErrors(exc);
7041 Py_DECREF(exc);
7042 }
7043 Py_XDECREF(encoding_obj);
7044 return -1;
7045 }
7046
7047 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7048 pusedDefaultChar = &usedDefaultChar;
7049 else
7050 pusedDefaultChar = NULL;
7051
7052 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7053 PyErr_NoMemory();
7054 goto error;
7055 }
7056 outsize = insize * Py_ARRAY_LENGTH(buffer);
7057
7058 if (*outbytes == NULL) {
7059 /* Create string object */
7060 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7061 if (*outbytes == NULL)
7062 goto error;
7063 out = PyBytes_AS_STRING(*outbytes);
7064 }
7065 else {
7066 /* Extend string object */
7067 Py_ssize_t n = PyBytes_Size(*outbytes);
7068 if (n > PY_SSIZE_T_MAX - outsize) {
7069 PyErr_NoMemory();
7070 goto error;
7071 }
7072 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7073 goto error;
7074 out = PyBytes_AS_STRING(*outbytes) + n;
7075 }
7076
7077 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007078 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007079 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007080 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7081 wchar_t chars[2];
7082 int charsize;
7083 if (ch < 0x10000) {
7084 chars[0] = (wchar_t)ch;
7085 charsize = 1;
7086 }
7087 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007088 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7089 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007090 charsize = 2;
7091 }
7092
Victor Stinner3a50e702011-10-18 21:21:00 +02007093 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007094 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007095 buffer, Py_ARRAY_LENGTH(buffer),
7096 NULL, pusedDefaultChar);
7097 if (outsize > 0) {
7098 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7099 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007100 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 memcpy(out, buffer, outsize);
7102 out += outsize;
7103 continue;
7104 }
7105 }
7106 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7107 PyErr_SetFromWindowsErr(0);
7108 goto error;
7109 }
7110
Victor Stinner3a50e702011-10-18 21:21:00 +02007111 rep = unicode_encode_call_errorhandler(
7112 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007113 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007114 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 if (rep == NULL)
7116 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007117 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007118
7119 if (PyBytes_Check(rep)) {
7120 outsize = PyBytes_GET_SIZE(rep);
7121 if (outsize != 1) {
7122 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7123 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7124 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7125 Py_DECREF(rep);
7126 goto error;
7127 }
7128 out = PyBytes_AS_STRING(*outbytes) + offset;
7129 }
7130 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7131 out += outsize;
7132 }
7133 else {
7134 Py_ssize_t i;
7135 enum PyUnicode_Kind kind;
7136 void *data;
7137
Benjamin Petersonbac79492012-01-14 13:34:47 -05007138 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 Py_DECREF(rep);
7140 goto error;
7141 }
7142
7143 outsize = PyUnicode_GET_LENGTH(rep);
7144 if (outsize != 1) {
7145 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7146 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7147 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7148 Py_DECREF(rep);
7149 goto error;
7150 }
7151 out = PyBytes_AS_STRING(*outbytes) + offset;
7152 }
7153 kind = PyUnicode_KIND(rep);
7154 data = PyUnicode_DATA(rep);
7155 for (i=0; i < outsize; i++) {
7156 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7157 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007158 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007159 encoding, unicode,
7160 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 "unable to encode error handler result to ASCII");
7162 Py_DECREF(rep);
7163 goto error;
7164 }
7165 *out = (unsigned char)ch;
7166 out++;
7167 }
7168 }
7169 Py_DECREF(rep);
7170 }
7171 /* write a NUL byte */
7172 *out = 0;
7173 outsize = out - PyBytes_AS_STRING(*outbytes);
7174 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7175 if (_PyBytes_Resize(outbytes, outsize) < 0)
7176 goto error;
7177 ret = 0;
7178
7179error:
7180 Py_XDECREF(encoding_obj);
7181 Py_XDECREF(errorHandler);
7182 Py_XDECREF(exc);
7183 return ret;
7184}
7185
Victor Stinner3a50e702011-10-18 21:21:00 +02007186static PyObject *
7187encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007188 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007189 const char *errors)
7190{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007191 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007193 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007194 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007195
Benjamin Petersonbac79492012-01-14 13:34:47 -05007196 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007197 return NULL;
7198 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007199
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 if (code_page < 0) {
7201 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7202 return NULL;
7203 }
7204
Martin v. Löwis3d325192011-11-04 18:23:06 +01007205 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007206 return PyBytes_FromStringAndSize(NULL, 0);
7207
Victor Stinner7581cef2011-11-03 22:32:33 +01007208 offset = 0;
7209 do
7210 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007211#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007212 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007213 chunks. */
7214 if (len > INT_MAX/2) {
7215 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007216 done = 0;
7217 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007218 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007219#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007220 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007221 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007222 done = 1;
7223 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007224
Victor Stinner76a31a62011-11-04 00:05:13 +01007225 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007226 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007227 errors);
7228 if (ret == -2)
7229 ret = encode_code_page_errors(code_page, &outbytes,
7230 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007231 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007232 if (ret < 0) {
7233 Py_XDECREF(outbytes);
7234 return NULL;
7235 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007236
Victor Stinner7581cef2011-11-03 22:32:33 +01007237 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007238 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007239 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007240
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 return outbytes;
7242}
7243
7244PyObject *
7245PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7246 Py_ssize_t size,
7247 const char *errors)
7248{
Victor Stinner7581cef2011-11-03 22:32:33 +01007249 PyObject *unicode, *res;
7250 unicode = PyUnicode_FromUnicode(p, size);
7251 if (unicode == NULL)
7252 return NULL;
7253 res = encode_code_page(CP_ACP, unicode, errors);
7254 Py_DECREF(unicode);
7255 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007256}
7257
7258PyObject *
7259PyUnicode_EncodeCodePage(int code_page,
7260 PyObject *unicode,
7261 const char *errors)
7262{
Victor Stinner7581cef2011-11-03 22:32:33 +01007263 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007264}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007265
Alexander Belopolsky40018472011-02-26 01:02:56 +00007266PyObject *
7267PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007268{
7269 if (!PyUnicode_Check(unicode)) {
7270 PyErr_BadArgument();
7271 return NULL;
7272 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007273 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007274}
7275
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007276#undef NEED_RETRY
7277
Victor Stinner99b95382011-07-04 14:23:54 +02007278#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007279
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280/* --- Character Mapping Codec -------------------------------------------- */
7281
Alexander Belopolsky40018472011-02-26 01:02:56 +00007282PyObject *
7283PyUnicode_DecodeCharmap(const char *s,
7284 Py_ssize_t size,
7285 PyObject *mapping,
7286 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007288 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007289 Py_ssize_t startinpos;
7290 Py_ssize_t endinpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007291 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007292 _PyUnicodeWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007293 PyObject *errorHandler = NULL;
7294 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007295
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296 /* Default to Latin-1 */
7297 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007300 if (size == 0) {
7301 Py_INCREF(unicode_empty);
7302 return unicode_empty;
7303 }
7304 _PyUnicodeWriter_Init(&writer, 0);
7305 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007307
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007308 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007309 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007310 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007311 enum PyUnicode_Kind mapkind;
7312 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007313 Py_UCS4 x;
7314
Benjamin Petersonbac79492012-01-14 13:34:47 -05007315 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007316 return NULL;
7317
7318 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007319 mapdata = PyUnicode_DATA(mapping);
7320 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007322 unsigned char ch;
7323 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007324 enum PyUnicode_Kind outkind = writer.kind;
7325 void *outdata = writer.data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007326 if (outkind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007327 Py_UCS4 maxchar = writer.maxchar;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007328 while (s < e) {
7329 unsigned char ch = *s;
7330 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7331 if (x > maxchar)
7332 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007333 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, writer.pos, x);
7334 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007335 ++s;
7336 }
7337 break;
7338 }
7339 else if (outkind == PyUnicode_2BYTE_KIND) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007340 while (s < e) {
7341 unsigned char ch = *s;
7342 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7343 if (x == 0xFFFE)
7344 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007345 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, writer.pos, x);
7346 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007347 ++s;
7348 }
7349 break;
7350 }
7351 }
7352 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353
Benjamin Peterson29060642009-01-31 22:14:21 +00007354 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007355 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007356 else
7357 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007358Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007359 if (x == 0xfffe)
7360 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007362 startinpos = s-starts;
7363 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007364 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 errors, &errorHandler,
7366 "charmap", "character maps to <undefined>",
7367 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007368 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 goto onError;
7370 }
7371 continue;
7372 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007373
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007374 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007375 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007376 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
7377 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007379 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007380 }
7381 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 while (s < e) {
7383 unsigned char ch = *s;
7384 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007385
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7387 w = PyLong_FromLong((long)ch);
7388 if (w == NULL)
7389 goto onError;
7390 x = PyObject_GetItem(mapping, w);
7391 Py_DECREF(w);
7392 if (x == NULL) {
7393 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7394 /* No mapping found means: mapping is undefined. */
7395 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007396 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 } else
7398 goto onError;
7399 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007400
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007402 if (x == Py_None)
7403 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007404 if (PyLong_Check(x)) {
7405 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007406 if (value == 0xFFFE)
7407 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007408 if (value < 0 || value > MAX_UNICODE) {
7409 PyErr_Format(PyExc_TypeError,
7410 "character mapping must be in range(0x%lx)",
7411 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 Py_DECREF(x);
7413 goto onError;
7414 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007415
7416 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007417 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007418 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7419 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 else if (PyUnicode_Check(x)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05007422 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007423 goto onError;
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007424 if (PyUnicode_GET_LENGTH(x) == 1) {
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007425 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007426 if (value == 0xFFFE)
7427 goto Undefined;
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007428 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1)
7429 goto onError;
7430 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7431 writer.pos++;
7432 }
7433 else {
7434 writer.overallocate = 1;
7435 if (_PyUnicodeWriter_WriteStr(&writer, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007436 goto onError;
7437 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 }
7439 else {
7440 /* wrong return value */
7441 PyErr_SetString(PyExc_TypeError,
7442 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007443 Py_DECREF(x);
7444 goto onError;
7445 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 Py_DECREF(x);
7447 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007448 continue;
7449Undefined:
7450 /* undefined mapping */
7451 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007452 startinpos = s-starts;
7453 endinpos = startinpos+1;
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007454 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007455 errors, &errorHandler,
7456 "charmap", "character maps to <undefined>",
7457 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007458 &writer)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007459 goto onError;
7460 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007461 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007463 Py_XDECREF(errorHandler);
7464 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007465 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007466
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007468 Py_XDECREF(errorHandler);
7469 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007470 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 return NULL;
7472}
7473
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007474/* Charmap encoding: the lookup table */
7475
Alexander Belopolsky40018472011-02-26 01:02:56 +00007476struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 PyObject_HEAD
7478 unsigned char level1[32];
7479 int count2, count3;
7480 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007481};
7482
7483static PyObject*
7484encoding_map_size(PyObject *obj, PyObject* args)
7485{
7486 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007487 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007489}
7490
7491static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007492 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 PyDoc_STR("Return the size (in bytes) of this object") },
7494 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007495};
7496
7497static void
7498encoding_map_dealloc(PyObject* o)
7499{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007500 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007501}
7502
7503static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007504 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007505 "EncodingMap", /*tp_name*/
7506 sizeof(struct encoding_map), /*tp_basicsize*/
7507 0, /*tp_itemsize*/
7508 /* methods */
7509 encoding_map_dealloc, /*tp_dealloc*/
7510 0, /*tp_print*/
7511 0, /*tp_getattr*/
7512 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007513 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 0, /*tp_repr*/
7515 0, /*tp_as_number*/
7516 0, /*tp_as_sequence*/
7517 0, /*tp_as_mapping*/
7518 0, /*tp_hash*/
7519 0, /*tp_call*/
7520 0, /*tp_str*/
7521 0, /*tp_getattro*/
7522 0, /*tp_setattro*/
7523 0, /*tp_as_buffer*/
7524 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7525 0, /*tp_doc*/
7526 0, /*tp_traverse*/
7527 0, /*tp_clear*/
7528 0, /*tp_richcompare*/
7529 0, /*tp_weaklistoffset*/
7530 0, /*tp_iter*/
7531 0, /*tp_iternext*/
7532 encoding_map_methods, /*tp_methods*/
7533 0, /*tp_members*/
7534 0, /*tp_getset*/
7535 0, /*tp_base*/
7536 0, /*tp_dict*/
7537 0, /*tp_descr_get*/
7538 0, /*tp_descr_set*/
7539 0, /*tp_dictoffset*/
7540 0, /*tp_init*/
7541 0, /*tp_alloc*/
7542 0, /*tp_new*/
7543 0, /*tp_free*/
7544 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007545};
7546
7547PyObject*
7548PyUnicode_BuildEncodingMap(PyObject* string)
7549{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007550 PyObject *result;
7551 struct encoding_map *mresult;
7552 int i;
7553 int need_dict = 0;
7554 unsigned char level1[32];
7555 unsigned char level2[512];
7556 unsigned char *mlevel1, *mlevel2, *mlevel3;
7557 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007558 int kind;
7559 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007560 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007561 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007562
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007563 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007564 PyErr_BadArgument();
7565 return NULL;
7566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007567 kind = PyUnicode_KIND(string);
7568 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007569 length = PyUnicode_GET_LENGTH(string);
7570 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007571 memset(level1, 0xFF, sizeof level1);
7572 memset(level2, 0xFF, sizeof level2);
7573
7574 /* If there isn't a one-to-one mapping of NULL to \0,
7575 or if there are non-BMP characters, we need to use
7576 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007577 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007578 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007579 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007580 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007581 ch = PyUnicode_READ(kind, data, i);
7582 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007583 need_dict = 1;
7584 break;
7585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007586 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007587 /* unmapped character */
7588 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007589 l1 = ch >> 11;
7590 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007591 if (level1[l1] == 0xFF)
7592 level1[l1] = count2++;
7593 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007594 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007595 }
7596
7597 if (count2 >= 0xFF || count3 >= 0xFF)
7598 need_dict = 1;
7599
7600 if (need_dict) {
7601 PyObject *result = PyDict_New();
7602 PyObject *key, *value;
7603 if (!result)
7604 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007605 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007606 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007607 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007608 if (!key || !value)
7609 goto failed1;
7610 if (PyDict_SetItem(result, key, value) == -1)
7611 goto failed1;
7612 Py_DECREF(key);
7613 Py_DECREF(value);
7614 }
7615 return result;
7616 failed1:
7617 Py_XDECREF(key);
7618 Py_XDECREF(value);
7619 Py_DECREF(result);
7620 return NULL;
7621 }
7622
7623 /* Create a three-level trie */
7624 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7625 16*count2 + 128*count3 - 1);
7626 if (!result)
7627 return PyErr_NoMemory();
7628 PyObject_Init(result, &EncodingMapType);
7629 mresult = (struct encoding_map*)result;
7630 mresult->count2 = count2;
7631 mresult->count3 = count3;
7632 mlevel1 = mresult->level1;
7633 mlevel2 = mresult->level23;
7634 mlevel3 = mresult->level23 + 16*count2;
7635 memcpy(mlevel1, level1, 32);
7636 memset(mlevel2, 0xFF, 16*count2);
7637 memset(mlevel3, 0, 128*count3);
7638 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007639 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007640 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007641 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7642 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007643 /* unmapped character */
7644 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007645 o1 = ch>>11;
7646 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007647 i2 = 16*mlevel1[o1] + o2;
7648 if (mlevel2[i2] == 0xFF)
7649 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007650 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007651 i3 = 128*mlevel2[i2] + o3;
7652 mlevel3[i3] = i;
7653 }
7654 return result;
7655}
7656
7657static int
Victor Stinner22168992011-11-20 17:09:18 +01007658encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007659{
7660 struct encoding_map *map = (struct encoding_map*)mapping;
7661 int l1 = c>>11;
7662 int l2 = (c>>7) & 0xF;
7663 int l3 = c & 0x7F;
7664 int i;
7665
Victor Stinner22168992011-11-20 17:09:18 +01007666 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007668 if (c == 0)
7669 return 0;
7670 /* level 1*/
7671 i = map->level1[l1];
7672 if (i == 0xFF) {
7673 return -1;
7674 }
7675 /* level 2*/
7676 i = map->level23[16*i+l2];
7677 if (i == 0xFF) {
7678 return -1;
7679 }
7680 /* level 3 */
7681 i = map->level23[16*map->count2 + 128*i + l3];
7682 if (i == 0) {
7683 return -1;
7684 }
7685 return i;
7686}
7687
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007688/* Lookup the character ch in the mapping. If the character
7689 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007690 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007691static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007692charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693{
Christian Heimes217cfd12007-12-02 14:31:20 +00007694 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007695 PyObject *x;
7696
7697 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007698 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007699 x = PyObject_GetItem(mapping, w);
7700 Py_DECREF(w);
7701 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7703 /* No mapping found means: mapping is undefined. */
7704 PyErr_Clear();
7705 x = Py_None;
7706 Py_INCREF(x);
7707 return x;
7708 } else
7709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007711 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007713 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 long value = PyLong_AS_LONG(x);
7715 if (value < 0 || value > 255) {
7716 PyErr_SetString(PyExc_TypeError,
7717 "character mapping must be in range(256)");
7718 Py_DECREF(x);
7719 return NULL;
7720 }
7721 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007723 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 /* wrong return value */
7727 PyErr_Format(PyExc_TypeError,
7728 "character mapping must return integer, bytes or None, not %.400s",
7729 x->ob_type->tp_name);
7730 Py_DECREF(x);
7731 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732 }
7733}
7734
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007735static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007736charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007737{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007738 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7739 /* exponentially overallocate to minimize reallocations */
7740 if (requiredsize < 2*outsize)
7741 requiredsize = 2*outsize;
7742 if (_PyBytes_Resize(outobj, requiredsize))
7743 return -1;
7744 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007745}
7746
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007750/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007751 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007752 space is available. Return a new reference to the object that
7753 was put in the output buffer, or Py_None, if the mapping was undefined
7754 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007755 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007756static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007757charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007758 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007759{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007760 PyObject *rep;
7761 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007762 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007763
Christian Heimes90aa7642007-12-19 02:45:37 +00007764 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007765 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007767 if (res == -1)
7768 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 if (outsize<requiredsize)
7770 if (charmapencode_resize(outobj, outpos, requiredsize))
7771 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007772 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 outstart[(*outpos)++] = (char)res;
7774 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007775 }
7776
7777 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007778 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007780 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 Py_DECREF(rep);
7782 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007783 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 if (PyLong_Check(rep)) {
7785 Py_ssize_t requiredsize = *outpos+1;
7786 if (outsize<requiredsize)
7787 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7788 Py_DECREF(rep);
7789 return enc_EXCEPTION;
7790 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007791 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007793 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 else {
7795 const char *repchars = PyBytes_AS_STRING(rep);
7796 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7797 Py_ssize_t requiredsize = *outpos+repsize;
7798 if (outsize<requiredsize)
7799 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7800 Py_DECREF(rep);
7801 return enc_EXCEPTION;
7802 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007803 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 memcpy(outstart + *outpos, repchars, repsize);
7805 *outpos += repsize;
7806 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007807 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007808 Py_DECREF(rep);
7809 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007810}
7811
7812/* handle an error in PyUnicode_EncodeCharmap
7813 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007814static int
7815charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007816 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007817 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007818 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007819 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007820{
7821 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007822 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007823 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007824 enum PyUnicode_Kind kind;
7825 void *data;
7826 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007828 Py_ssize_t collstartpos = *inpos;
7829 Py_ssize_t collendpos = *inpos+1;
7830 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007831 char *encoding = "charmap";
7832 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007833 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007834 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007835 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007836
Benjamin Petersonbac79492012-01-14 13:34:47 -05007837 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007838 return -1;
7839 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007840 /* find all unencodable characters */
7841 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007842 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007843 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007844 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007845 val = encoding_map_lookup(ch, mapping);
7846 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 break;
7848 ++collendpos;
7849 continue;
7850 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007851
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007852 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7853 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 if (rep==NULL)
7855 return -1;
7856 else if (rep!=Py_None) {
7857 Py_DECREF(rep);
7858 break;
7859 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007860 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007862 }
7863 /* cache callback name lookup
7864 * (if not done yet, i.e. it's the first error) */
7865 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 if ((errors==NULL) || (!strcmp(errors, "strict")))
7867 *known_errorHandler = 1;
7868 else if (!strcmp(errors, "replace"))
7869 *known_errorHandler = 2;
7870 else if (!strcmp(errors, "ignore"))
7871 *known_errorHandler = 3;
7872 else if (!strcmp(errors, "xmlcharrefreplace"))
7873 *known_errorHandler = 4;
7874 else
7875 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007876 }
7877 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007878 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007879 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007880 return -1;
7881 case 2: /* replace */
7882 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 x = charmapencode_output('?', mapping, res, respos);
7884 if (x==enc_EXCEPTION) {
7885 return -1;
7886 }
7887 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007888 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 return -1;
7890 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007891 }
7892 /* fall through */
7893 case 3: /* ignore */
7894 *inpos = collendpos;
7895 break;
7896 case 4: /* xmlcharrefreplace */
7897 /* generate replacement (temporarily (mis)uses p) */
7898 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 char buffer[2+29+1+1];
7900 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007901 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 for (cp = buffer; *cp; ++cp) {
7903 x = charmapencode_output(*cp, mapping, res, respos);
7904 if (x==enc_EXCEPTION)
7905 return -1;
7906 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007907 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 return -1;
7909 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007910 }
7911 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007912 *inpos = collendpos;
7913 break;
7914 default:
7915 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007916 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007918 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007919 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007920 if (PyBytes_Check(repunicode)) {
7921 /* Directly copy bytes result to output. */
7922 Py_ssize_t outsize = PyBytes_Size(*res);
7923 Py_ssize_t requiredsize;
7924 repsize = PyBytes_Size(repunicode);
7925 requiredsize = *respos + repsize;
7926 if (requiredsize > outsize)
7927 /* Make room for all additional bytes. */
7928 if (charmapencode_resize(res, respos, requiredsize)) {
7929 Py_DECREF(repunicode);
7930 return -1;
7931 }
7932 memcpy(PyBytes_AsString(*res) + *respos,
7933 PyBytes_AsString(repunicode), repsize);
7934 *respos += repsize;
7935 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007936 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007937 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007938 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007939 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007940 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007941 Py_DECREF(repunicode);
7942 return -1;
7943 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007944 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007945 data = PyUnicode_DATA(repunicode);
7946 kind = PyUnicode_KIND(repunicode);
7947 for (index = 0; index < repsize; index++) {
7948 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7949 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007951 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007952 return -1;
7953 }
7954 else if (x==enc_FAILED) {
7955 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007956 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 return -1;
7958 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007959 }
7960 *inpos = newpos;
7961 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007962 }
7963 return 0;
7964}
7965
Alexander Belopolsky40018472011-02-26 01:02:56 +00007966PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007967_PyUnicode_EncodeCharmap(PyObject *unicode,
7968 PyObject *mapping,
7969 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007971 /* output object */
7972 PyObject *res = NULL;
7973 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007974 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007975 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007977 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007978 PyObject *errorHandler = NULL;
7979 PyObject *exc = NULL;
7980 /* the following variable is used for caching string comparisons
7981 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7982 * 3=ignore, 4=xmlcharrefreplace */
7983 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984
Benjamin Petersonbac79492012-01-14 13:34:47 -05007985 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007986 return NULL;
7987 size = PyUnicode_GET_LENGTH(unicode);
7988
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 /* Default to Latin-1 */
7990 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007991 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007993 /* allocate enough for a simple encoding without
7994 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007995 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007996 if (res == NULL)
7997 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007998 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007999 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008001 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008002 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008004 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 if (x==enc_EXCEPTION) /* error */
8006 goto onError;
8007 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008008 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 &exc,
8010 &known_errorHandler, &errorHandler, errors,
8011 &res, &respos)) {
8012 goto onError;
8013 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008014 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 else
8016 /* done with this character => adjust input position */
8017 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008020 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008021 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008022 if (_PyBytes_Resize(&res, respos) < 0)
8023 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008025 Py_XDECREF(exc);
8026 Py_XDECREF(errorHandler);
8027 return res;
8028
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008030 Py_XDECREF(res);
8031 Py_XDECREF(exc);
8032 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 return NULL;
8034}
8035
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008036/* Deprecated */
8037PyObject *
8038PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8039 Py_ssize_t size,
8040 PyObject *mapping,
8041 const char *errors)
8042{
8043 PyObject *result;
8044 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8045 if (unicode == NULL)
8046 return NULL;
8047 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8048 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008049 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008050}
8051
Alexander Belopolsky40018472011-02-26 01:02:56 +00008052PyObject *
8053PyUnicode_AsCharmapString(PyObject *unicode,
8054 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055{
8056 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 PyErr_BadArgument();
8058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008060 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061}
8062
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008063/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008064static void
8065make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008066 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008067 Py_ssize_t startpos, Py_ssize_t endpos,
8068 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008070 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008071 *exceptionObject = _PyUnicodeTranslateError_Create(
8072 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073 }
8074 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8076 goto onError;
8077 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8078 goto onError;
8079 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8080 goto onError;
8081 return;
8082 onError:
8083 Py_DECREF(*exceptionObject);
8084 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 }
8086}
8087
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088/* error handling callback helper:
8089 build arguments, call the callback and check the arguments,
8090 put the result into newpos and return the replacement string, which
8091 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008092static PyObject *
8093unicode_translate_call_errorhandler(const char *errors,
8094 PyObject **errorHandler,
8095 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008096 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008097 Py_ssize_t startpos, Py_ssize_t endpos,
8098 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008100 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008101
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008102 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008103 PyObject *restuple;
8104 PyObject *resunicode;
8105
8106 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008108 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110 }
8111
8112 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008114 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008116
8117 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008122 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 Py_DECREF(restuple);
8124 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008125 }
8126 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 &resunicode, &i_newpos)) {
8128 Py_DECREF(restuple);
8129 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008130 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008131 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008133 else
8134 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8137 Py_DECREF(restuple);
8138 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008139 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 Py_INCREF(resunicode);
8141 Py_DECREF(restuple);
8142 return resunicode;
8143}
8144
8145/* Lookup the character ch in the mapping and put the result in result,
8146 which must be decrefed by the caller.
8147 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008148static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008149charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008150{
Christian Heimes217cfd12007-12-02 14:31:20 +00008151 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008152 PyObject *x;
8153
8154 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008156 x = PyObject_GetItem(mapping, w);
8157 Py_DECREF(w);
8158 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8160 /* No mapping found means: use 1:1 mapping. */
8161 PyErr_Clear();
8162 *result = NULL;
8163 return 0;
8164 } else
8165 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008166 }
8167 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 *result = x;
8169 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008170 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008171 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 long value = PyLong_AS_LONG(x);
8173 long max = PyUnicode_GetMax();
8174 if (value < 0 || value > max) {
8175 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008176 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 Py_DECREF(x);
8178 return -1;
8179 }
8180 *result = x;
8181 return 0;
8182 }
8183 else if (PyUnicode_Check(x)) {
8184 *result = x;
8185 return 0;
8186 }
8187 else {
8188 /* wrong return value */
8189 PyErr_SetString(PyExc_TypeError,
8190 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 Py_DECREF(x);
8192 return -1;
8193 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008194}
8195/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 if not reallocate and adjust various state variables.
8197 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008198static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008199charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008201{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008202 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008203 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008204 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 /* exponentially overallocate to minimize reallocations */
8206 if (requiredsize < 2 * oldsize)
8207 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008208 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8209 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008211 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008212 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213 }
8214 return 0;
8215}
8216/* lookup the character, put the result in the output string and adjust
8217 various state variables. Return a new reference to the object that
8218 was put in the output buffer in *result, or Py_None, if the mapping was
8219 undefined (in which case no character was written).
8220 The called must decref result.
8221 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008222static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008223charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8224 PyObject *mapping, Py_UCS4 **output,
8225 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008226 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008227{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008228 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8229 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008231 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008234 }
8235 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008237 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008239 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240 }
8241 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008242 Py_ssize_t repsize;
8243 if (PyUnicode_READY(*res) == -1)
8244 return -1;
8245 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 if (repsize==1) {
8247 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 }
8250 else if (repsize!=0) {
8251 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 Py_ssize_t requiredsize = *opos +
8253 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008255 Py_ssize_t i;
8256 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 for(i = 0; i < repsize; i++)
8259 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008261 }
8262 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 return 0;
8265}
8266
Alexander Belopolsky40018472011-02-26 01:02:56 +00008267PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008268_PyUnicode_TranslateCharmap(PyObject *input,
8269 PyObject *mapping,
8270 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272 /* input object */
8273 char *idata;
8274 Py_ssize_t size, i;
8275 int kind;
8276 /* output buffer */
8277 Py_UCS4 *output = NULL;
8278 Py_ssize_t osize;
8279 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008281 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282 char *reason = "character maps to <undefined>";
8283 PyObject *errorHandler = NULL;
8284 PyObject *exc = NULL;
8285 /* the following variable is used for caching string comparisons
8286 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8287 * 3=ignore, 4=xmlcharrefreplace */
8288 int known_errorHandler = -1;
8289
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 PyErr_BadArgument();
8292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 if (PyUnicode_READY(input) == -1)
8296 return NULL;
8297 idata = (char*)PyUnicode_DATA(input);
8298 kind = PyUnicode_KIND(input);
8299 size = PyUnicode_GET_LENGTH(input);
8300 i = 0;
8301
8302 if (size == 0) {
8303 Py_INCREF(input);
8304 return input;
8305 }
8306
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008307 /* allocate enough for a simple 1:1 translation without
8308 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008309 osize = size;
8310 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8311 opos = 0;
8312 if (output == NULL) {
8313 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008317 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 /* try to encode it */
8319 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008320 if (charmaptranslate_output(input, i, mapping,
8321 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 Py_XDECREF(x);
8323 goto onError;
8324 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008325 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 else { /* untranslatable character */
8329 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8330 Py_ssize_t repsize;
8331 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334 Py_ssize_t collstart = i;
8335 Py_ssize_t collend = i+1;
8336 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008339 while (collend < size) {
8340 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 goto onError;
8342 Py_XDECREF(x);
8343 if (x!=Py_None)
8344 break;
8345 ++collend;
8346 }
8347 /* cache callback name lookup
8348 * (if not done yet, i.e. it's the first error) */
8349 if (known_errorHandler==-1) {
8350 if ((errors==NULL) || (!strcmp(errors, "strict")))
8351 known_errorHandler = 1;
8352 else if (!strcmp(errors, "replace"))
8353 known_errorHandler = 2;
8354 else if (!strcmp(errors, "ignore"))
8355 known_errorHandler = 3;
8356 else if (!strcmp(errors, "xmlcharrefreplace"))
8357 known_errorHandler = 4;
8358 else
8359 known_errorHandler = 0;
8360 }
8361 switch (known_errorHandler) {
8362 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008363 make_translate_exception(&exc,
8364 input, collstart, collend, reason);
8365 if (exc != NULL)
8366 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008367 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 case 2: /* replace */
8369 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 for (coll = collstart; coll<collend; coll++)
8371 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 /* fall through */
8373 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 break;
8376 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 /* generate replacement (temporarily (mis)uses i) */
8378 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 char buffer[2+29+1+1];
8380 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8382 if (charmaptranslate_makespace(&output, &osize,
8383 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 goto onError;
8385 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 break;
8390 default:
8391 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008392 reason, input, &exc,
8393 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008394 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008396 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008397 Py_DECREF(repunicode);
8398 goto onError;
8399 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 repsize = PyUnicode_GET_LENGTH(repunicode);
8402 if (charmaptranslate_makespace(&output, &osize,
8403 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 Py_DECREF(repunicode);
8405 goto onError;
8406 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008407 for (uni2 = 0; repsize-->0; ++uni2)
8408 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8409 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008411 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008412 }
8413 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008414 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8415 if (!res)
8416 goto onError;
8417 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 Py_XDECREF(exc);
8419 Py_XDECREF(errorHandler);
8420 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008423 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 Py_XDECREF(exc);
8425 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426 return NULL;
8427}
8428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008429/* Deprecated. Use PyUnicode_Translate instead. */
8430PyObject *
8431PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8432 Py_ssize_t size,
8433 PyObject *mapping,
8434 const char *errors)
8435{
Christian Heimes5f520f42012-09-11 14:03:25 +02008436 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8438 if (!unicode)
8439 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008440 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8441 Py_DECREF(unicode);
8442 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008443}
8444
Alexander Belopolsky40018472011-02-26 01:02:56 +00008445PyObject *
8446PyUnicode_Translate(PyObject *str,
8447 PyObject *mapping,
8448 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449{
8450 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008451
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 str = PyUnicode_FromObject(str);
8453 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008454 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456 Py_DECREF(str);
8457 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458}
Tim Petersced69f82003-09-16 20:30:58 +00008459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008461fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462{
8463 /* No need to call PyUnicode_READY(self) because this function is only
8464 called as a callback from fixup() which does it already. */
8465 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8466 const int kind = PyUnicode_KIND(self);
8467 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008468 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008469 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 Py_ssize_t i;
8471
8472 for (i = 0; i < len; ++i) {
8473 ch = PyUnicode_READ(kind, data, i);
8474 fixed = 0;
8475 if (ch > 127) {
8476 if (Py_UNICODE_ISSPACE(ch))
8477 fixed = ' ';
8478 else {
8479 const int decimal = Py_UNICODE_TODECIMAL(ch);
8480 if (decimal >= 0)
8481 fixed = '0' + decimal;
8482 }
8483 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008484 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008485 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486 PyUnicode_WRITE(kind, data, i, fixed);
8487 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008488 else
8489 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491 }
8492
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008493 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008494}
8495
8496PyObject *
8497_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8498{
8499 if (!PyUnicode_Check(unicode)) {
8500 PyErr_BadInternalCall();
8501 return NULL;
8502 }
8503 if (PyUnicode_READY(unicode) == -1)
8504 return NULL;
8505 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8506 /* If the string is already ASCII, just return the same string */
8507 Py_INCREF(unicode);
8508 return unicode;
8509 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008510 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511}
8512
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008513PyObject *
8514PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8515 Py_ssize_t length)
8516{
Victor Stinnerf0124502011-11-21 23:12:56 +01008517 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008518 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008519 Py_UCS4 maxchar;
8520 enum PyUnicode_Kind kind;
8521 void *data;
8522
Victor Stinner99d7ad02012-02-22 13:37:39 +01008523 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008524 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008525 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008526 if (ch > 127) {
8527 int decimal = Py_UNICODE_TODECIMAL(ch);
8528 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008529 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008530 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008531 }
8532 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008533
8534 /* Copy to a new string */
8535 decimal = PyUnicode_New(length, maxchar);
8536 if (decimal == NULL)
8537 return decimal;
8538 kind = PyUnicode_KIND(decimal);
8539 data = PyUnicode_DATA(decimal);
8540 /* Iterate over code points */
8541 for (i = 0; i < length; i++) {
8542 Py_UNICODE ch = s[i];
8543 if (ch > 127) {
8544 int decimal = Py_UNICODE_TODECIMAL(ch);
8545 if (decimal >= 0)
8546 ch = '0' + decimal;
8547 }
8548 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008550 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008551}
/* --- Decimal Encoder ---------------------------------------------------- */
8553
Alexander Belopolsky40018472011-02-26 01:02:56 +00008554int
8555PyUnicode_EncodeDecimal(Py_UNICODE *s,
8556 Py_ssize_t length,
8557 char *output,
8558 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008559{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008560 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008561 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008562 enum PyUnicode_Kind kind;
8563 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008564
8565 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 PyErr_BadArgument();
8567 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008568 }
8569
Victor Stinner42bf7752011-11-21 22:52:58 +01008570 unicode = PyUnicode_FromUnicode(s, length);
8571 if (unicode == NULL)
8572 return -1;
8573
Benjamin Petersonbac79492012-01-14 13:34:47 -05008574 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008575 Py_DECREF(unicode);
8576 return -1;
8577 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008578 kind = PyUnicode_KIND(unicode);
8579 data = PyUnicode_DATA(unicode);
8580
Victor Stinnerb84d7232011-11-22 01:50:07 +01008581 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008582 PyObject *exc;
8583 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008585 Py_ssize_t startpos;
8586
8587 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008588
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008590 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008591 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008593 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 decimal = Py_UNICODE_TODECIMAL(ch);
8595 if (decimal >= 0) {
8596 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008597 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 continue;
8599 }
8600 if (0 < ch && ch < 256) {
8601 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008602 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 continue;
8604 }
Victor Stinner6345be92011-11-25 20:09:01 +01008605
Victor Stinner42bf7752011-11-21 22:52:58 +01008606 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008607 exc = NULL;
8608 raise_encode_exception(&exc, "decimal", unicode,
8609 startpos, startpos+1,
8610 "invalid decimal Unicode string");
8611 Py_XDECREF(exc);
8612 Py_DECREF(unicode);
8613 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008614 }
8615 /* 0-terminate the output string */
8616 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008617 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008618 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008619}
8620
/* --- Helpers ------------------------------------------------------------ */
8622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008624any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 Py_ssize_t start,
8626 Py_ssize_t end)
8627{
8628 int kind1, kind2, kind;
8629 void *buf1, *buf2;
8630 Py_ssize_t len1, len2, result;
8631
8632 kind1 = PyUnicode_KIND(s1);
8633 kind2 = PyUnicode_KIND(s2);
8634 kind = kind1 > kind2 ? kind1 : kind2;
8635 buf1 = PyUnicode_DATA(s1);
8636 buf2 = PyUnicode_DATA(s2);
8637 if (kind1 != kind)
8638 buf1 = _PyUnicode_AsKind(s1, kind);
8639 if (!buf1)
8640 return -2;
8641 if (kind2 != kind)
8642 buf2 = _PyUnicode_AsKind(s2, kind);
8643 if (!buf2) {
8644 if (kind1 != kind) PyMem_Free(buf1);
8645 return -2;
8646 }
8647 len1 = PyUnicode_GET_LENGTH(s1);
8648 len2 = PyUnicode_GET_LENGTH(s2);
8649
Victor Stinner794d5672011-10-10 03:21:36 +02008650 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008651 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008652 case PyUnicode_1BYTE_KIND:
8653 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8654 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8655 else
8656 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8657 break;
8658 case PyUnicode_2BYTE_KIND:
8659 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8660 break;
8661 case PyUnicode_4BYTE_KIND:
8662 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8663 break;
8664 default:
8665 assert(0); result = -2;
8666 }
8667 }
8668 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008669 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008670 case PyUnicode_1BYTE_KIND:
8671 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8672 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8673 else
8674 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8675 break;
8676 case PyUnicode_2BYTE_KIND:
8677 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8678 break;
8679 case PyUnicode_4BYTE_KIND:
8680 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8681 break;
8682 default:
8683 assert(0); result = -2;
8684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 }
8686
8687 if (kind1 != kind)
8688 PyMem_Free(buf1);
8689 if (kind2 != kind)
8690 PyMem_Free(buf2);
8691
8692 return result;
8693}
8694
8695Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008696_PyUnicode_InsertThousandsGrouping(
8697 PyObject *unicode, Py_ssize_t index,
8698 Py_ssize_t n_buffer,
8699 void *digits, Py_ssize_t n_digits,
8700 Py_ssize_t min_width,
8701 const char *grouping, PyObject *thousands_sep,
8702 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703{
Victor Stinner41a863c2012-02-24 00:37:51 +01008704 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008705 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008706 Py_ssize_t thousands_sep_len;
8707 Py_ssize_t len;
8708
8709 if (unicode != NULL) {
8710 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008711 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008712 }
8713 else {
8714 kind = PyUnicode_1BYTE_KIND;
8715 data = NULL;
8716 }
8717 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8718 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8719 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8720 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008721 if (thousands_sep_kind < kind) {
8722 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8723 if (!thousands_sep_data)
8724 return -1;
8725 }
8726 else {
8727 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8728 if (!data)
8729 return -1;
8730 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008731 }
8732
Benjamin Petersonead6b532011-12-20 17:23:42 -06008733 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008735 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008736 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008737 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008738 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008739 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008740 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008741 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008742 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008743 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008744 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008745 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008747 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008748 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008749 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008750 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008751 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008752 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008753 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008754 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008755 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008756 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008757 break;
8758 default:
8759 assert(0);
8760 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008761 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008762 if (unicode != NULL && thousands_sep_kind != kind) {
8763 if (thousands_sep_kind < kind)
8764 PyMem_Free(thousands_sep_data);
8765 else
8766 PyMem_Free(data);
8767 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008768 if (unicode == NULL) {
8769 *maxchar = 127;
8770 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008771 *maxchar = MAX_MAXCHAR(*maxchar,
8772 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008773 }
8774 }
8775 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776}
8777
8778
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the sequence
   length.  An `end` larger than `len` is clamped to `len`.  A negative
   `start` or `end` has `len` added to it and is clamped to 0 if it is
   still negative.  `start` is not clamped against `len` or `end`.

   The arguments are expanded more than once, so they must not have side
   effects.  The body is wrapped in do { } while (0) so that the macro
   behaves as one statement (for example as the unbraced body of an
   if/else); every use must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008793
Alexander Belopolsky40018472011-02-26 01:02:56 +00008794Py_ssize_t
8795PyUnicode_Count(PyObject *str,
8796 PyObject *substr,
8797 Py_ssize_t start,
8798 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008800 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008801 PyObject* str_obj;
8802 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 int kind1, kind2, kind;
8804 void *buf1 = NULL, *buf2 = NULL;
8805 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008806
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008807 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008808 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008810 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008811 if (!sub_obj) {
8812 Py_DECREF(str_obj);
8813 return -1;
8814 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008815 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008816 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008817 Py_DECREF(str_obj);
8818 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819 }
Tim Petersced69f82003-09-16 20:30:58 +00008820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821 kind1 = PyUnicode_KIND(str_obj);
8822 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008823 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008826 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008827 if (kind2 > kind) {
8828 Py_DECREF(sub_obj);
8829 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008830 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008831 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008832 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008833 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 if (!buf2)
8835 goto onError;
8836 len1 = PyUnicode_GET_LENGTH(str_obj);
8837 len2 = PyUnicode_GET_LENGTH(sub_obj);
8838
8839 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008840 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008842 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8843 result = asciilib_count(
8844 ((Py_UCS1*)buf1) + start, end - start,
8845 buf2, len2, PY_SSIZE_T_MAX
8846 );
8847 else
8848 result = ucs1lib_count(
8849 ((Py_UCS1*)buf1) + start, end - start,
8850 buf2, len2, PY_SSIZE_T_MAX
8851 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 break;
8853 case PyUnicode_2BYTE_KIND:
8854 result = ucs2lib_count(
8855 ((Py_UCS2*)buf1) + start, end - start,
8856 buf2, len2, PY_SSIZE_T_MAX
8857 );
8858 break;
8859 case PyUnicode_4BYTE_KIND:
8860 result = ucs4lib_count(
8861 ((Py_UCS4*)buf1) + start, end - start,
8862 buf2, len2, PY_SSIZE_T_MAX
8863 );
8864 break;
8865 default:
8866 assert(0); result = 0;
8867 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008868
8869 Py_DECREF(sub_obj);
8870 Py_DECREF(str_obj);
8871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 if (kind2 != kind)
8873 PyMem_Free(buf2);
8874
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 onError:
8877 Py_DECREF(sub_obj);
8878 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 if (kind2 != kind && buf2)
8880 PyMem_Free(buf2);
8881 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882}
8883
Alexander Belopolsky40018472011-02-26 01:02:56 +00008884Py_ssize_t
8885PyUnicode_Find(PyObject *str,
8886 PyObject *sub,
8887 Py_ssize_t start,
8888 Py_ssize_t end,
8889 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008891 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008892
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008894 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008896 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008897 if (!sub) {
8898 Py_DECREF(str);
8899 return -2;
8900 }
8901 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8902 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 Py_DECREF(str);
8904 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 }
Tim Petersced69f82003-09-16 20:30:58 +00008906
Victor Stinner794d5672011-10-10 03:21:36 +02008907 result = any_find_slice(direction,
8908 str, sub, start, end
8909 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008910
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008912 Py_DECREF(sub);
8913
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914 return result;
8915}
8916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917Py_ssize_t
8918PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8919 Py_ssize_t start, Py_ssize_t end,
8920 int direction)
8921{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008923 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 if (PyUnicode_READY(str) == -1)
8925 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008926 if (start < 0 || end < 0) {
8927 PyErr_SetString(PyExc_IndexError, "string index out of range");
8928 return -2;
8929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 if (end > PyUnicode_GET_LENGTH(str))
8931 end = PyUnicode_GET_LENGTH(str);
8932 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008933 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8934 kind, end-start, ch, direction);
8935 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008937 else
8938 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939}
8940
Alexander Belopolsky40018472011-02-26 01:02:56 +00008941static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008942tailmatch(PyObject *self,
8943 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008944 Py_ssize_t start,
8945 Py_ssize_t end,
8946 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 int kind_self;
8949 int kind_sub;
8950 void *data_self;
8951 void *data_sub;
8952 Py_ssize_t offset;
8953 Py_ssize_t i;
8954 Py_ssize_t end_sub;
8955
8956 if (PyUnicode_READY(self) == -1 ||
8957 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01008958 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959
8960 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961 return 1;
8962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8964 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008966 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968 kind_self = PyUnicode_KIND(self);
8969 data_self = PyUnicode_DATA(self);
8970 kind_sub = PyUnicode_KIND(substring);
8971 data_sub = PyUnicode_DATA(substring);
8972 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8973
8974 if (direction > 0)
8975 offset = end;
8976 else
8977 offset = start;
8978
8979 if (PyUnicode_READ(kind_self, data_self, offset) ==
8980 PyUnicode_READ(kind_sub, data_sub, 0) &&
8981 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8982 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8983 /* If both are of the same kind, memcmp is sufficient */
8984 if (kind_self == kind_sub) {
8985 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008986 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987 data_sub,
8988 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008989 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990 }
8991 /* otherwise we have to compare each character by first accesing it */
8992 else {
8993 /* We do not need to compare 0 and len(substring)-1 because
8994 the if statement above ensured already that they are equal
8995 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008996 for (i = 1; i < end_sub; ++i) {
8997 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8998 PyUnicode_READ(kind_sub, data_sub, i))
8999 return 0;
9000 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003 }
9004
9005 return 0;
9006}
9007
Alexander Belopolsky40018472011-02-26 01:02:56 +00009008Py_ssize_t
9009PyUnicode_Tailmatch(PyObject *str,
9010 PyObject *substr,
9011 Py_ssize_t start,
9012 Py_ssize_t end,
9013 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009015 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009016
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017 str = PyUnicode_FromObject(str);
9018 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009019 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020 substr = PyUnicode_FromObject(substr);
9021 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009022 Py_DECREF(str);
9023 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024 }
Tim Petersced69f82003-09-16 20:30:58 +00009025
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009026 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009027 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028 Py_DECREF(str);
9029 Py_DECREF(substr);
9030 return result;
9031}
9032
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033/* Apply fixfct filter to the Unicode object self and return a
9034 reference to the modified object */
9035
Alexander Belopolsky40018472011-02-26 01:02:56 +00009036static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009037fixup(PyObject *self,
9038 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040 PyObject *u;
9041 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009042 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009044 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009046 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009047 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 /* fix functions return the new maximum character in a string,
9050 if the kind of the resulting unicode object does not change,
9051 everything is fine. Otherwise we need to change the string kind
9052 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009053 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009054
9055 if (maxchar_new == 0) {
9056 /* no changes */;
9057 if (PyUnicode_CheckExact(self)) {
9058 Py_DECREF(u);
9059 Py_INCREF(self);
9060 return self;
9061 }
9062 else
9063 return u;
9064 }
9065
Victor Stinnere6abb482012-05-02 01:15:40 +02009066 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067
Victor Stinnereaab6042011-12-11 22:22:39 +01009068 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009070
9071 /* In case the maximum character changed, we need to
9072 convert the string to the new category. */
9073 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9074 if (v == NULL) {
9075 Py_DECREF(u);
9076 return NULL;
9077 }
9078 if (maxchar_new > maxchar_old) {
9079 /* If the maxchar increased so that the kind changed, not all
9080 characters are representable anymore and we need to fix the
9081 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009082 _PyUnicode_FastCopyCharacters(v, 0,
9083 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009084 maxchar_old = fixfct(v);
9085 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009086 }
9087 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009088 _PyUnicode_FastCopyCharacters(v, 0,
9089 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009091 Py_DECREF(u);
9092 assert(_PyUnicode_CheckConsistency(v, 1));
9093 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094}
9095
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009096static PyObject *
9097ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009099 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9100 char *resdata, *data = PyUnicode_DATA(self);
9101 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009102
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009103 res = PyUnicode_New(len, 127);
9104 if (res == NULL)
9105 return NULL;
9106 resdata = PyUnicode_DATA(res);
9107 if (lower)
9108 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009110 _Py_bytes_upper(resdata, data, len);
9111 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112}
9113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009115handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009117 Py_ssize_t j;
9118 int final_sigma;
9119 Py_UCS4 c;
9120 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009121
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009122 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9123
9124 where ! is a negation and \p{xxx} is a character with property xxx.
9125 */
9126 for (j = i - 1; j >= 0; j--) {
9127 c = PyUnicode_READ(kind, data, j);
9128 if (!_PyUnicode_IsCaseIgnorable(c))
9129 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009131 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9132 if (final_sigma) {
9133 for (j = i + 1; j < length; j++) {
9134 c = PyUnicode_READ(kind, data, j);
9135 if (!_PyUnicode_IsCaseIgnorable(c))
9136 break;
9137 }
9138 final_sigma = j == length || !_PyUnicode_IsCased(c);
9139 }
9140 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141}
9142
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009143static int
9144lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9145 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009147 /* Obscure special case. */
9148 if (c == 0x3A3) {
9149 mapped[0] = handle_capital_sigma(kind, data, length, i);
9150 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009152 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153}
9154
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009155static Py_ssize_t
9156do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009158 Py_ssize_t i, k = 0;
9159 int n_res, j;
9160 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009161
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009162 c = PyUnicode_READ(kind, data, 0);
9163 n_res = _PyUnicode_ToUpperFull(c, mapped);
9164 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009165 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009166 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009168 for (i = 1; i < length; i++) {
9169 c = PyUnicode_READ(kind, data, i);
9170 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9171 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009172 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009173 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009174 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009175 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009176 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177}
9178
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009179static Py_ssize_t
9180do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9181 Py_ssize_t i, k = 0;
9182
9183 for (i = 0; i < length; i++) {
9184 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9185 int n_res, j;
9186 if (Py_UNICODE_ISUPPER(c)) {
9187 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9188 }
9189 else if (Py_UNICODE_ISLOWER(c)) {
9190 n_res = _PyUnicode_ToUpperFull(c, mapped);
9191 }
9192 else {
9193 n_res = 1;
9194 mapped[0] = c;
9195 }
9196 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009197 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009198 res[k++] = mapped[j];
9199 }
9200 }
9201 return k;
9202}
9203
9204static Py_ssize_t
9205do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9206 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009208 Py_ssize_t i, k = 0;
9209
9210 for (i = 0; i < length; i++) {
9211 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9212 int n_res, j;
9213 if (lower)
9214 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9215 else
9216 n_res = _PyUnicode_ToUpperFull(c, mapped);
9217 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009218 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009219 res[k++] = mapped[j];
9220 }
9221 }
9222 return k;
9223}
9224
9225static Py_ssize_t
9226do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9227{
9228 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9229}
9230
9231static Py_ssize_t
9232do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9233{
9234 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9235}
9236
Benjamin Petersone51757f2012-01-12 21:10:29 -05009237static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009238do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9239{
9240 Py_ssize_t i, k = 0;
9241
9242 for (i = 0; i < length; i++) {
9243 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9244 Py_UCS4 mapped[3];
9245 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9246 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009247 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009248 res[k++] = mapped[j];
9249 }
9250 }
9251 return k;
9252}
9253
9254static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009255do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9256{
9257 Py_ssize_t i, k = 0;
9258 int previous_is_cased;
9259
9260 previous_is_cased = 0;
9261 for (i = 0; i < length; i++) {
9262 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9263 Py_UCS4 mapped[3];
9264 int n_res, j;
9265
9266 if (previous_is_cased)
9267 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9268 else
9269 n_res = _PyUnicode_ToTitleFull(c, mapped);
9270
9271 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009272 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009273 res[k++] = mapped[j];
9274 }
9275
9276 previous_is_cased = _PyUnicode_IsCased(c);
9277 }
9278 return k;
9279}
9280
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009281static PyObject *
9282case_operation(PyObject *self,
9283 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9284{
9285 PyObject *res = NULL;
9286 Py_ssize_t length, newlength = 0;
9287 int kind, outkind;
9288 void *data, *outdata;
9289 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9290
Benjamin Petersoneea48462012-01-16 14:28:50 -05009291 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009292
9293 kind = PyUnicode_KIND(self);
9294 data = PyUnicode_DATA(self);
9295 length = PyUnicode_GET_LENGTH(self);
9296 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9297 if (tmp == NULL)
9298 return PyErr_NoMemory();
9299 newlength = perform(kind, data, length, tmp, &maxchar);
9300 res = PyUnicode_New(newlength, maxchar);
9301 if (res == NULL)
9302 goto leave;
9303 tmpend = tmp + newlength;
9304 outdata = PyUnicode_DATA(res);
9305 outkind = PyUnicode_KIND(res);
9306 switch (outkind) {
9307 case PyUnicode_1BYTE_KIND:
9308 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9309 break;
9310 case PyUnicode_2BYTE_KIND:
9311 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9312 break;
9313 case PyUnicode_4BYTE_KIND:
9314 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9315 break;
9316 default:
9317 assert(0);
9318 break;
9319 }
9320 leave:
9321 PyMem_FREE(tmp);
9322 return res;
9323}
9324
Tim Peters8ce9f162004-08-27 01:49:32 +00009325PyObject *
9326PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009327{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009329 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009331 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009332 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9333 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009334 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009335 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009336 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009338 int use_memcpy;
9339 unsigned char *res_data = NULL, *sep_data = NULL;
9340 PyObject *last_obj;
9341 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342
Tim Peters05eba1f2004-08-27 21:32:02 +00009343 fseq = PySequence_Fast(seq, "");
9344 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009345 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009346 }
9347
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009348 /* NOTE: the following code can't call back into Python code,
9349 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009350 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009351
Tim Peters05eba1f2004-08-27 21:32:02 +00009352 seqlen = PySequence_Fast_GET_SIZE(fseq);
9353 /* If empty sequence, return u"". */
9354 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009355 Py_DECREF(fseq);
9356 Py_INCREF(unicode_empty);
9357 res = unicode_empty;
9358 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009359 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009360
Tim Peters05eba1f2004-08-27 21:32:02 +00009361 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009362 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009363 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009364 if (seqlen == 1) {
9365 if (PyUnicode_CheckExact(items[0])) {
9366 res = items[0];
9367 Py_INCREF(res);
9368 Py_DECREF(fseq);
9369 return res;
9370 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009371 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009372 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009373 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009374 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009375 /* Set up sep and seplen */
9376 if (separator == NULL) {
9377 /* fall back to a blank space separator */
9378 sep = PyUnicode_FromOrdinal(' ');
9379 if (!sep)
9380 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009381 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009382 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009383 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009384 else {
9385 if (!PyUnicode_Check(separator)) {
9386 PyErr_Format(PyExc_TypeError,
9387 "separator: expected str instance,"
9388 " %.80s found",
9389 Py_TYPE(separator)->tp_name);
9390 goto onError;
9391 }
9392 if (PyUnicode_READY(separator))
9393 goto onError;
9394 sep = separator;
9395 seplen = PyUnicode_GET_LENGTH(separator);
9396 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9397 /* inc refcount to keep this code path symmetric with the
9398 above case of a blank separator */
9399 Py_INCREF(sep);
9400 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009401 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009402 }
9403
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009404 /* There are at least two things to join, or else we have a subclass
9405 * of str in the sequence.
9406 * Do a pre-pass to figure out the total amount of space we'll
9407 * need (sz), and see whether all argument are strings.
9408 */
9409 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009410#ifdef Py_DEBUG
9411 use_memcpy = 0;
9412#else
9413 use_memcpy = 1;
9414#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009415 for (i = 0; i < seqlen; i++) {
9416 const Py_ssize_t old_sz = sz;
9417 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009418 if (!PyUnicode_Check(item)) {
9419 PyErr_Format(PyExc_TypeError,
9420 "sequence item %zd: expected str instance,"
9421 " %.80s found",
9422 i, Py_TYPE(item)->tp_name);
9423 goto onError;
9424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 if (PyUnicode_READY(item) == -1)
9426 goto onError;
9427 sz += PyUnicode_GET_LENGTH(item);
9428 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009429 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009430 if (i != 0)
9431 sz += seplen;
9432 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9433 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009434 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009435 goto onError;
9436 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009437 if (use_memcpy && last_obj != NULL) {
9438 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9439 use_memcpy = 0;
9440 }
9441 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009442 }
Tim Petersced69f82003-09-16 20:30:58 +00009443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009445 if (res == NULL)
9446 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009447
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009448 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009449#ifdef Py_DEBUG
9450 use_memcpy = 0;
9451#else
9452 if (use_memcpy) {
9453 res_data = PyUnicode_1BYTE_DATA(res);
9454 kind = PyUnicode_KIND(res);
9455 if (seplen != 0)
9456 sep_data = PyUnicode_1BYTE_DATA(sep);
9457 }
9458#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009460 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009461 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009462 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009463 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009464 if (use_memcpy) {
9465 Py_MEMCPY(res_data,
9466 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009467 kind * seplen);
9468 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009469 }
9470 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009471 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009472 res_offset += seplen;
9473 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009474 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009475 itemlen = PyUnicode_GET_LENGTH(item);
9476 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009477 if (use_memcpy) {
9478 Py_MEMCPY(res_data,
9479 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009480 kind * itemlen);
9481 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009482 }
9483 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009484 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009485 res_offset += itemlen;
9486 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009487 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009488 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009489 if (use_memcpy)
9490 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009491 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009492 else
9493 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009494
Tim Peters05eba1f2004-08-27 21:32:02 +00009495 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009497 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499
Benjamin Peterson29060642009-01-31 22:14:21 +00009500 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009501 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009503 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504 return NULL;
9505}
9506
/* FILL(kind, data, value, start, length)
 *
 * Store `value` into `length` consecutive characters of the buffer `data`,
 * beginning at character index `start`.  `kind` selects the element width
 * (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND);
 * PyUnicode_WCHAR_KIND is not supported.
 *
 * Every argument is evaluated exactly once, so expressions with side
 * effects are safe.  The caller is responsible for `value` fitting the
 * element width and for [start, start + length) lying inside the buffer.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        const int fill_kind_ = (int)(kind); \
        const Py_UCS4 fill_value_ = (value); \
        const Py_ssize_t fill_start_ = (start); \
        const Py_ssize_t fill_length_ = (length); \
        Py_ssize_t i_ = 0; \
        assert(fill_kind_ != PyUnicode_WCHAR_KIND); \
        switch (fill_kind_) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *to_ = (unsigned char *)(data) + fill_start_; \
            memset(to_, (unsigned char)fill_value_, (size_t)fill_length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *to_ = (Py_UCS2 *)(data) + fill_start_; \
            for (; i_ < fill_length_; ++i_, ++to_) \
                *to_ = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 *to_ = (Py_UCS4 *)(data) + fill_start_; \
            for (; i_ < fill_length_; ++i_, ++to_) \
                *to_ = fill_value_; \
            break; \
        } \
        default: \
            assert(0); \
            break; \
        } \
    } while (0)
9530
Victor Stinnerd3f08822012-05-29 12:57:52 +02009531void
9532_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9533 Py_UCS4 fill_char)
9534{
9535 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9536 const void *data = PyUnicode_DATA(unicode);
9537 assert(PyUnicode_IS_READY(unicode));
9538 assert(unicode_modifiable(unicode));
9539 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9540 assert(start >= 0);
9541 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9542 FILL(kind, data, fill_char, start, length);
9543}
9544
Victor Stinner3fe55312012-01-04 00:33:50 +01009545Py_ssize_t
9546PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9547 Py_UCS4 fill_char)
9548{
9549 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009550
9551 if (!PyUnicode_Check(unicode)) {
9552 PyErr_BadInternalCall();
9553 return -1;
9554 }
9555 if (PyUnicode_READY(unicode) == -1)
9556 return -1;
9557 if (unicode_check_modifiable(unicode))
9558 return -1;
9559
Victor Stinnerd3f08822012-05-29 12:57:52 +02009560 if (start < 0) {
9561 PyErr_SetString(PyExc_IndexError, "string index out of range");
9562 return -1;
9563 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009564 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9565 PyErr_SetString(PyExc_ValueError,
9566 "fill character is bigger than "
9567 "the string maximum character");
9568 return -1;
9569 }
9570
9571 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9572 length = Py_MIN(maxlen, length);
9573 if (length <= 0)
9574 return 0;
9575
Victor Stinnerd3f08822012-05-29 12:57:52 +02009576 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009577 return length;
9578}
9579
Victor Stinner9310abb2011-10-05 00:59:23 +02009580static PyObject *
9581pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009582 Py_ssize_t left,
9583 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 PyObject *u;
9587 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009588 int kind;
9589 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590
9591 if (left < 0)
9592 left = 0;
9593 if (right < 0)
9594 right = 0;
9595
Victor Stinnerc4b49542011-12-11 22:44:26 +01009596 if (left == 0 && right == 0)
9597 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9600 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009601 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9602 return NULL;
9603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009605 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009607 if (!u)
9608 return NULL;
9609
9610 kind = PyUnicode_KIND(u);
9611 data = PyUnicode_DATA(u);
9612 if (left)
9613 FILL(kind, data, fill, 0, left);
9614 if (right)
9615 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009616 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009617 assert(_PyUnicode_CheckConsistency(u, 1));
9618 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619}
9620
Alexander Belopolsky40018472011-02-26 01:02:56 +00009621PyObject *
9622PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625
9626 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009627 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009628 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009629 if (PyUnicode_READY(string) == -1) {
9630 Py_DECREF(string);
9631 return NULL;
9632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633
Benjamin Petersonead6b532011-12-20 17:23:42 -06009634 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009636 if (PyUnicode_IS_ASCII(string))
9637 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009638 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009639 PyUnicode_GET_LENGTH(string), keepends);
9640 else
9641 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009642 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009643 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 break;
9645 case PyUnicode_2BYTE_KIND:
9646 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009647 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 PyUnicode_GET_LENGTH(string), keepends);
9649 break;
9650 case PyUnicode_4BYTE_KIND:
9651 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009652 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 PyUnicode_GET_LENGTH(string), keepends);
9654 break;
9655 default:
9656 assert(0);
9657 list = 0;
9658 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009659 Py_DECREF(string);
9660 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661}
9662
Alexander Belopolsky40018472011-02-26 01:02:56 +00009663static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009664split(PyObject *self,
9665 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009666 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 int kind1, kind2, kind;
9669 void *buf1, *buf2;
9670 Py_ssize_t len1, len2;
9671 PyObject* out;
9672
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009674 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 if (PyUnicode_READY(self) == -1)
9677 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009680 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009681 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009682 if (PyUnicode_IS_ASCII(self))
9683 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009684 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009685 PyUnicode_GET_LENGTH(self), maxcount
9686 );
9687 else
9688 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009689 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009690 PyUnicode_GET_LENGTH(self), maxcount
9691 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 case PyUnicode_2BYTE_KIND:
9693 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009694 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 PyUnicode_GET_LENGTH(self), maxcount
9696 );
9697 case PyUnicode_4BYTE_KIND:
9698 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009699 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 PyUnicode_GET_LENGTH(self), maxcount
9701 );
9702 default:
9703 assert(0);
9704 return NULL;
9705 }
9706
9707 if (PyUnicode_READY(substring) == -1)
9708 return NULL;
9709
9710 kind1 = PyUnicode_KIND(self);
9711 kind2 = PyUnicode_KIND(substring);
9712 kind = kind1 > kind2 ? kind1 : kind2;
9713 buf1 = PyUnicode_DATA(self);
9714 buf2 = PyUnicode_DATA(substring);
9715 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009716 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 if (!buf1)
9718 return NULL;
9719 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009720 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 if (!buf2) {
9722 if (kind1 != kind) PyMem_Free(buf1);
9723 return NULL;
9724 }
9725 len1 = PyUnicode_GET_LENGTH(self);
9726 len2 = PyUnicode_GET_LENGTH(substring);
9727
Benjamin Petersonead6b532011-12-20 17:23:42 -06009728 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009730 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9731 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009732 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009733 else
9734 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009735 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 break;
9737 case PyUnicode_2BYTE_KIND:
9738 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009739 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 break;
9741 case PyUnicode_4BYTE_KIND:
9742 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009743 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009744 break;
9745 default:
9746 out = NULL;
9747 }
9748 if (kind1 != kind)
9749 PyMem_Free(buf1);
9750 if (kind2 != kind)
9751 PyMem_Free(buf2);
9752 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753}
9754
Alexander Belopolsky40018472011-02-26 01:02:56 +00009755static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009756rsplit(PyObject *self,
9757 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009758 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009759{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 int kind1, kind2, kind;
9761 void *buf1, *buf2;
9762 Py_ssize_t len1, len2;
9763 PyObject* out;
9764
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009765 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009766 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 if (PyUnicode_READY(self) == -1)
9769 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009771 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009772 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009774 if (PyUnicode_IS_ASCII(self))
9775 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009776 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009777 PyUnicode_GET_LENGTH(self), maxcount
9778 );
9779 else
9780 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009781 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009782 PyUnicode_GET_LENGTH(self), maxcount
9783 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 case PyUnicode_2BYTE_KIND:
9785 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009786 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 PyUnicode_GET_LENGTH(self), maxcount
9788 );
9789 case PyUnicode_4BYTE_KIND:
9790 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009791 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 PyUnicode_GET_LENGTH(self), maxcount
9793 );
9794 default:
9795 assert(0);
9796 return NULL;
9797 }
9798
9799 if (PyUnicode_READY(substring) == -1)
9800 return NULL;
9801
9802 kind1 = PyUnicode_KIND(self);
9803 kind2 = PyUnicode_KIND(substring);
9804 kind = kind1 > kind2 ? kind1 : kind2;
9805 buf1 = PyUnicode_DATA(self);
9806 buf2 = PyUnicode_DATA(substring);
9807 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009808 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 if (!buf1)
9810 return NULL;
9811 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009812 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 if (!buf2) {
9814 if (kind1 != kind) PyMem_Free(buf1);
9815 return NULL;
9816 }
9817 len1 = PyUnicode_GET_LENGTH(self);
9818 len2 = PyUnicode_GET_LENGTH(substring);
9819
Benjamin Petersonead6b532011-12-20 17:23:42 -06009820 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009822 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9823 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009824 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009825 else
9826 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009827 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 break;
9829 case PyUnicode_2BYTE_KIND:
9830 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009831 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 break;
9833 case PyUnicode_4BYTE_KIND:
9834 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009835 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836 break;
9837 default:
9838 out = NULL;
9839 }
9840 if (kind1 != kind)
9841 PyMem_Free(buf1);
9842 if (kind2 != kind)
9843 PyMem_Free(buf2);
9844 return out;
9845}
9846
9847static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009848anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9849 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009851 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009853 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9854 return asciilib_find(buf1, len1, buf2, len2, offset);
9855 else
9856 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 case PyUnicode_2BYTE_KIND:
9858 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9859 case PyUnicode_4BYTE_KIND:
9860 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9861 }
9862 assert(0);
9863 return -1;
9864}
9865
9866static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009867anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9868 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009870 switch (kind) {
9871 case PyUnicode_1BYTE_KIND:
9872 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9873 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9874 else
9875 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9876 case PyUnicode_2BYTE_KIND:
9877 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9878 case PyUnicode_4BYTE_KIND:
9879 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9880 }
9881 assert(0);
9882 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009883}
9884
Alexander Belopolsky40018472011-02-26 01:02:56 +00009885static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886replace(PyObject *self, PyObject *str1,
9887 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 PyObject *u;
9890 char *sbuf = PyUnicode_DATA(self);
9891 char *buf1 = PyUnicode_DATA(str1);
9892 char *buf2 = PyUnicode_DATA(str2);
9893 int srelease = 0, release1 = 0, release2 = 0;
9894 int skind = PyUnicode_KIND(self);
9895 int kind1 = PyUnicode_KIND(str1);
9896 int kind2 = PyUnicode_KIND(str2);
9897 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9898 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9899 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009900 int mayshrink;
9901 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902
9903 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009904 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009906 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907
Victor Stinner59de0ee2011-10-07 10:01:28 +02009908 if (str1 == str2)
9909 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 if (skind < kind1)
9911 /* substring too wide to be present */
9912 goto nothing;
9913
Victor Stinner49a0a212011-10-12 23:46:10 +02009914 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9915 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9916 /* Replacing str1 with str2 may cause a maxchar reduction in the
9917 result string. */
9918 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +02009919 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009922 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009924 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009926 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009927 Py_UCS4 u1, u2;
9928 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +01009929 Py_ssize_t index, pos;
9930 char *src;
9931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +01009933 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9934 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009935 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009938 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +02009940 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +01009942
9943 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9944 index = 0;
9945 src = sbuf;
9946 while (--maxcount)
9947 {
9948 pos++;
9949 src += pos * PyUnicode_KIND(self);
9950 slen -= pos;
9951 index += pos;
9952 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9953 if (pos < 0)
9954 break;
9955 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9956 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009957 }
9958 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 int rkind = skind;
9960 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009961 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 if (kind1 < rkind) {
9964 /* widen substring */
9965 buf1 = _PyUnicode_AsKind(str1, rkind);
9966 if (!buf1) goto error;
9967 release1 = 1;
9968 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009969 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009970 if (i < 0)
9971 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 if (rkind > kind2) {
9973 /* widen replacement */
9974 buf2 = _PyUnicode_AsKind(str2, rkind);
9975 if (!buf2) goto error;
9976 release2 = 1;
9977 }
9978 else if (rkind < kind2) {
9979 /* widen self and buf1 */
9980 rkind = kind2;
9981 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +01009982 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 sbuf = _PyUnicode_AsKind(self, rkind);
9984 if (!sbuf) goto error;
9985 srelease = 1;
9986 buf1 = _PyUnicode_AsKind(str1, rkind);
9987 if (!buf1) goto error;
9988 release1 = 1;
9989 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009990 u = PyUnicode_New(slen, maxchar);
9991 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009993 assert(PyUnicode_KIND(u) == rkind);
9994 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009995
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009996 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009997 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009998 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010000 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010002
10003 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010004 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010005 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010006 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010007 if (i == -1)
10008 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010009 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010011 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010015 }
10016 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010018 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 int rkind = skind;
10020 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010023 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 buf1 = _PyUnicode_AsKind(str1, rkind);
10025 if (!buf1) goto error;
10026 release1 = 1;
10027 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010028 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010029 if (n == 0)
10030 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010032 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 buf2 = _PyUnicode_AsKind(str2, rkind);
10034 if (!buf2) goto error;
10035 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010038 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 rkind = kind2;
10040 sbuf = _PyUnicode_AsKind(self, rkind);
10041 if (!sbuf) goto error;
10042 srelease = 1;
10043 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010044 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 buf1 = _PyUnicode_AsKind(str1, rkind);
10046 if (!buf1) goto error;
10047 release1 = 1;
10048 }
10049 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10050 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010051 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 PyErr_SetString(PyExc_OverflowError,
10053 "replace string is too long");
10054 goto error;
10055 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010056 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010057 if (new_size == 0) {
10058 Py_INCREF(unicode_empty);
10059 u = unicode_empty;
10060 goto done;
10061 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010062 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 PyErr_SetString(PyExc_OverflowError,
10064 "replace string is too long");
10065 goto error;
10066 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010067 u = PyUnicode_New(new_size, maxchar);
10068 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010070 assert(PyUnicode_KIND(u) == rkind);
10071 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 ires = i = 0;
10073 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010074 while (n-- > 0) {
10075 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010076 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010077 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010078 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010079 if (j == -1)
10080 break;
10081 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010082 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010083 memcpy(res + rkind * ires,
10084 sbuf + rkind * i,
10085 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010087 }
10088 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010090 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010092 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010098 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010099 memcpy(res + rkind * ires,
10100 sbuf + rkind * i,
10101 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010102 }
10103 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010104 /* interleave */
10105 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010106 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010108 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010110 if (--n <= 0)
10111 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010112 memcpy(res + rkind * ires,
10113 sbuf + rkind * i,
10114 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 ires++;
10116 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010117 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010118 memcpy(res + rkind * ires,
10119 sbuf + rkind * i,
10120 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010121 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010122 }
10123
10124 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010125 unicode_adjust_maxchar(&u);
10126 if (u == NULL)
10127 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010129
10130 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 if (srelease)
10132 PyMem_FREE(sbuf);
10133 if (release1)
10134 PyMem_FREE(buf1);
10135 if (release2)
10136 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010137 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010139
Benjamin Peterson29060642009-01-31 22:14:21 +000010140 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010141 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 if (srelease)
10143 PyMem_FREE(sbuf);
10144 if (release1)
10145 PyMem_FREE(buf1);
10146 if (release2)
10147 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010148 return unicode_result_unchanged(self);
10149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 error:
10151 if (srelease && sbuf)
10152 PyMem_FREE(sbuf);
10153 if (release1 && buf1)
10154 PyMem_FREE(buf1);
10155 if (release2 && buf2)
10156 PyMem_FREE(buf2);
10157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158}
10159
10160/* --- Unicode Object Methods --------------------------------------------- */
10161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010162PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010163 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164\n\
10165Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010166characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167
10168static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010169unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010171 if (PyUnicode_READY(self) == -1)
10172 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010173 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174}
10175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010176PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010177 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178\n\
10179Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010180have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181
10182static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010183unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010185 if (PyUnicode_READY(self) == -1)
10186 return NULL;
10187 if (PyUnicode_GET_LENGTH(self) == 0)
10188 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010189 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010190}
10191
Benjamin Petersond5890c82012-01-14 13:23:30 -050010192PyDoc_STRVAR(casefold__doc__,
10193 "S.casefold() -> str\n\
10194\n\
10195Return a version of S suitable for caseless comparisons.");
10196
10197static PyObject *
10198unicode_casefold(PyObject *self)
10199{
10200 if (PyUnicode_READY(self) == -1)
10201 return NULL;
10202 if (PyUnicode_IS_ASCII(self))
10203 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010204 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010205}
10206
10207
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010208/* Argument converter. Coerces to a single unicode character */
10209
10210static int
10211convert_uc(PyObject *obj, void *addr)
10212{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010214 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010215
Benjamin Peterson14339b62009-01-31 16:36:08 +000010216 uniobj = PyUnicode_FromObject(obj);
10217 if (uniobj == NULL) {
10218 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010219 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010220 return 0;
10221 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010223 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010224 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010225 Py_DECREF(uniobj);
10226 return 0;
10227 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010229 Py_DECREF(uniobj);
10230 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010231}
10232
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010233PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010234 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010236Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010237done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238
10239static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010240unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010242 Py_ssize_t marg, left;
10243 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 Py_UCS4 fillchar = ' ';
10245
Victor Stinnere9a29352011-10-01 02:14:59 +020010246 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248
Benjamin Petersonbac79492012-01-14 13:34:47 -050010249 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250 return NULL;
10251
Victor Stinnerc4b49542011-12-11 22:44:26 +010010252 if (PyUnicode_GET_LENGTH(self) >= width)
10253 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254
Victor Stinnerc4b49542011-12-11 22:44:26 +010010255 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256 left = marg / 2 + (marg & width & 1);
10257
Victor Stinner9310abb2011-10-05 00:59:23 +020010258 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010259}
10260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261/* This function assumes that str1 and str2 are readied by the caller. */
10262
Marc-André Lemburge5034372000-08-08 08:04:29 +000010263static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010264unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010265{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 int kind1, kind2;
10267 void *data1, *data2;
Victor Stinner770e19e2012-10-04 22:59:45 +020010268 Py_ssize_t len1, len2;
10269 Py_ssize_t i, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010270
Victor Stinner90db9c42012-10-04 21:53:50 +020010271 /* a string is equal to itself */
10272 if (str1 == str2)
10273 return 0;
10274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 kind1 = PyUnicode_KIND(str1);
10276 kind2 = PyUnicode_KIND(str2);
10277 data1 = PyUnicode_DATA(str1);
10278 data2 = PyUnicode_DATA(str2);
10279 len1 = PyUnicode_GET_LENGTH(str1);
10280 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010281 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010282
Victor Stinner770e19e2012-10-04 22:59:45 +020010283 if (kind1 == 1 && kind2 == 1) {
10284 int cmp = memcmp(data1, data2, len);
10285 /* normalize result of memcmp() into the range [-1; 1] */
10286 if (cmp < 0)
10287 return -1;
10288 if (cmp > 0)
10289 return 1;
10290 }
10291 else {
10292 for (i = 0; i < len; ++i) {
10293 Py_UCS4 c1, c2;
10294 c1 = PyUnicode_READ(kind1, data1, i);
10295 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010296
Victor Stinner770e19e2012-10-04 22:59:45 +020010297 if (c1 != c2)
10298 return (c1 < c2) ? -1 : 1;
10299 }
Marc-André Lemburge5034372000-08-08 08:04:29 +000010300 }
10301
Victor Stinner770e19e2012-10-04 22:59:45 +020010302 if (len1 == len2)
10303 return 0;
10304 if (len1 < len2)
10305 return -1;
10306 else
10307 return 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010308}
10309
Victor Stinnere5567ad2012-10-23 02:48:49 +020010310static int
10311unicode_compare_eq(PyObject *str1, PyObject *str2)
10312{
10313 int kind;
10314 void *data1, *data2;
10315 Py_ssize_t len;
10316 int cmp;
10317
10318 /* a string is equal to itself */
10319 if (str1 == str2)
10320 return 1;
10321
10322 len = PyUnicode_GET_LENGTH(str1);
10323 if (PyUnicode_GET_LENGTH(str2) != len)
10324 return 0;
10325 kind = PyUnicode_KIND(str1);
10326 if (PyUnicode_KIND(str2) != kind)
10327 return 0;
10328 data1 = PyUnicode_DATA(str1);
10329 data2 = PyUnicode_DATA(str2);
10330
10331 cmp = memcmp(data1, data2, len * kind);
10332 return (cmp == 0);
10333}
10334
10335
Alexander Belopolsky40018472011-02-26 01:02:56 +000010336int
10337PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010338{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10340 if (PyUnicode_READY(left) == -1 ||
10341 PyUnicode_READY(right) == -1)
10342 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010343 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010345 PyErr_Format(PyExc_TypeError,
10346 "Can't compare %.100s and %.100s",
10347 left->ob_type->tp_name,
10348 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349 return -1;
10350}
10351
Martin v. Löwis5b222132007-06-10 09:51:05 +000010352int
10353PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 Py_ssize_t i;
10356 int kind;
10357 void *data;
10358 Py_UCS4 chr;
10359
Victor Stinner910337b2011-10-03 03:20:16 +020010360 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 if (PyUnicode_READY(uni) == -1)
10362 return -1;
10363 kind = PyUnicode_KIND(uni);
10364 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010365 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10367 if (chr != str[i])
10368 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010369 /* This check keeps Python strings that end in '\0' from comparing equal
10370 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010372 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010373 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010374 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010375 return 0;
10376}
10377
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010378
Benjamin Peterson29060642009-01-31 22:14:21 +000010379#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010380 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010381
Alexander Belopolsky40018472011-02-26 01:02:56 +000010382PyObject *
10383PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010384{
10385 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010386 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010387
Victor Stinnere5567ad2012-10-23 02:48:49 +020010388 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10389 Py_RETURN_NOTIMPLEMENTED;
10390
10391 if (PyUnicode_READY(left) == -1 ||
10392 PyUnicode_READY(right) == -1)
10393 return NULL;
10394
10395 if (op == Py_EQ || op == Py_NE) {
10396 result = unicode_compare_eq(left, right);
10397 if (op == Py_EQ)
10398 v = TEST_COND(result);
10399 else
10400 v = TEST_COND(!result);
10401 }
10402 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010403 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010404
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010405 /* Convert the return value to a Boolean */
10406 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010407 case Py_LE:
10408 v = TEST_COND(result <= 0);
10409 break;
10410 case Py_GE:
10411 v = TEST_COND(result >= 0);
10412 break;
10413 case Py_LT:
10414 v = TEST_COND(result == -1);
10415 break;
10416 case Py_GT:
10417 v = TEST_COND(result == 1);
10418 break;
10419 default:
10420 PyErr_BadArgument();
10421 return NULL;
10422 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010423 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010424 Py_INCREF(v);
10425 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010426}
10427
Alexander Belopolsky40018472011-02-26 01:02:56 +000010428int
10429PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010430{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010431 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 int kind1, kind2, kind;
10433 void *buf1, *buf2;
10434 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010435 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010436
10437 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010438 sub = PyUnicode_FromObject(element);
10439 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010440 PyErr_Format(PyExc_TypeError,
10441 "'in <string>' requires string as left operand, not %s",
10442 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010443 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010444 }
10445
Thomas Wouters477c8d52006-05-27 19:21:47 +000010446 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010447 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010448 Py_DECREF(sub);
10449 return -1;
10450 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010451 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10452 Py_DECREF(sub);
10453 Py_DECREF(str);
10454 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 kind1 = PyUnicode_KIND(str);
10457 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010458 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 buf1 = PyUnicode_DATA(str);
10460 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010461 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010462 if (kind2 > kind) {
10463 Py_DECREF(sub);
10464 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010465 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010466 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010467 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 if (!buf2) {
10470 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010471 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 return -1;
10473 }
10474 len1 = PyUnicode_GET_LENGTH(str);
10475 len2 = PyUnicode_GET_LENGTH(sub);
10476
Benjamin Petersonead6b532011-12-20 17:23:42 -060010477 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 case PyUnicode_1BYTE_KIND:
10479 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10480 break;
10481 case PyUnicode_2BYTE_KIND:
10482 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10483 break;
10484 case PyUnicode_4BYTE_KIND:
10485 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10486 break;
10487 default:
10488 result = -1;
10489 assert(0);
10490 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010491
10492 Py_DECREF(str);
10493 Py_DECREF(sub);
10494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 if (kind2 != kind)
10496 PyMem_Free(buf2);
10497
Guido van Rossum403d68b2000-03-13 15:55:09 +000010498 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010499}
10500
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501/* Concat to string or Unicode object giving a new Unicode object. */
10502
Alexander Belopolsky40018472011-02-26 01:02:56 +000010503PyObject *
10504PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010507 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010508 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509
10510 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010512 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010513 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010516 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517
10518 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010519 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010520 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010523 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010524 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526 }
10527
Victor Stinner488fa492011-12-12 00:01:39 +010010528 u_len = PyUnicode_GET_LENGTH(u);
10529 v_len = PyUnicode_GET_LENGTH(v);
10530 if (u_len > PY_SSIZE_T_MAX - v_len) {
10531 PyErr_SetString(PyExc_OverflowError,
10532 "strings are too large to concat");
10533 goto onError;
10534 }
10535 new_len = u_len + v_len;
10536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010538 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010539 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010542 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010544 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010545 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10546 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547 Py_DECREF(u);
10548 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010549 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551
Benjamin Peterson29060642009-01-31 22:14:21 +000010552 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010553 Py_XDECREF(u);
10554 Py_XDECREF(v);
10555 return NULL;
10556}
10557
Walter Dörwald1ab83302007-05-18 17:15:44 +000010558void
Victor Stinner23e56682011-10-03 03:54:37 +020010559PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010560{
Victor Stinner23e56682011-10-03 03:54:37 +020010561 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010562 Py_UCS4 maxchar, maxchar2;
10563 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010564
10565 if (p_left == NULL) {
10566 if (!PyErr_Occurred())
10567 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010568 return;
10569 }
Victor Stinner23e56682011-10-03 03:54:37 +020010570 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010571 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010572 if (!PyErr_Occurred())
10573 PyErr_BadInternalCall();
10574 goto error;
10575 }
10576
Benjamin Petersonbac79492012-01-14 13:34:47 -050010577 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010578 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010579 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010580 goto error;
10581
Victor Stinner488fa492011-12-12 00:01:39 +010010582 /* Shortcuts */
10583 if (left == unicode_empty) {
10584 Py_DECREF(left);
10585 Py_INCREF(right);
10586 *p_left = right;
10587 return;
10588 }
10589 if (right == unicode_empty)
10590 return;
10591
10592 left_len = PyUnicode_GET_LENGTH(left);
10593 right_len = PyUnicode_GET_LENGTH(right);
10594 if (left_len > PY_SSIZE_T_MAX - right_len) {
10595 PyErr_SetString(PyExc_OverflowError,
10596 "strings are too large to concat");
10597 goto error;
10598 }
10599 new_len = left_len + right_len;
10600
10601 if (unicode_modifiable(left)
10602 && PyUnicode_CheckExact(right)
10603 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010604 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10605 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010606 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010607 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010608 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10609 {
10610 /* append inplace */
10611 if (unicode_resize(p_left, new_len) != 0) {
10612 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10613 * deallocated so it cannot be put back into
10614 * 'variable'. The MemoryError is raised when there
10615 * is no value in 'variable', which might (very
10616 * remotely) be a cause of incompatibilities.
10617 */
10618 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010619 }
Victor Stinner488fa492011-12-12 00:01:39 +010010620 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010621 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010622 }
Victor Stinner488fa492011-12-12 00:01:39 +010010623 else {
10624 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10625 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010626 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010627
Victor Stinner488fa492011-12-12 00:01:39 +010010628 /* Concat the two Unicode strings */
10629 res = PyUnicode_New(new_len, maxchar);
10630 if (res == NULL)
10631 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010632 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10633 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010634 Py_DECREF(left);
10635 *p_left = res;
10636 }
10637 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010638 return;
10639
10640error:
Victor Stinner488fa492011-12-12 00:01:39 +010010641 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010642}
10643
10644void
10645PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10646{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010647 PyUnicode_Append(pleft, right);
10648 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010649}
10650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010651PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010652 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010653\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010655string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010656interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657
10658static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010659unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010661 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010662 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010663 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 int kind1, kind2, kind;
10666 void *buf1, *buf2;
10667 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668
Jesus Ceaac451502011-04-20 17:09:23 +020010669 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10670 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010671 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 kind1 = PyUnicode_KIND(self);
10674 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010675 if (kind2 > kind1)
10676 return PyLong_FromLong(0);
10677 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 buf1 = PyUnicode_DATA(self);
10679 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010681 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 if (!buf2) {
10683 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 return NULL;
10685 }
10686 len1 = PyUnicode_GET_LENGTH(self);
10687 len2 = PyUnicode_GET_LENGTH(substring);
10688
10689 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010690 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 case PyUnicode_1BYTE_KIND:
10692 iresult = ucs1lib_count(
10693 ((Py_UCS1*)buf1) + start, end - start,
10694 buf2, len2, PY_SSIZE_T_MAX
10695 );
10696 break;
10697 case PyUnicode_2BYTE_KIND:
10698 iresult = ucs2lib_count(
10699 ((Py_UCS2*)buf1) + start, end - start,
10700 buf2, len2, PY_SSIZE_T_MAX
10701 );
10702 break;
10703 case PyUnicode_4BYTE_KIND:
10704 iresult = ucs4lib_count(
10705 ((Py_UCS4*)buf1) + start, end - start,
10706 buf2, len2, PY_SSIZE_T_MAX
10707 );
10708 break;
10709 default:
10710 assert(0); iresult = 0;
10711 }
10712
10713 result = PyLong_FromSsize_t(iresult);
10714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 if (kind2 != kind)
10716 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717
10718 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010719
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 return result;
10721}
10722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010723PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010724 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010726Encode S using the codec registered for encoding. Default encoding\n\
10727is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010728handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010729a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10730'xmlcharrefreplace' as well as any other name registered with\n\
10731codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732
10733static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010734unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010736 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737 char *encoding = NULL;
10738 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010739
Benjamin Peterson308d6372009-09-18 21:42:35 +000010740 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10741 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010743 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010744}
10745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010746PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010747 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748\n\
10749Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010750If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751
10752static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010753unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010755 Py_ssize_t i, j, line_pos, src_len, incr;
10756 Py_UCS4 ch;
10757 PyObject *u;
10758 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010760 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010761 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762
10763 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010764 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765
Antoine Pitrou22425222011-10-04 19:10:51 +020010766 if (PyUnicode_READY(self) == -1)
10767 return NULL;
10768
Thomas Wouters7e474022000-07-16 12:04:32 +000010769 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010770 src_len = PyUnicode_GET_LENGTH(self);
10771 i = j = line_pos = 0;
10772 kind = PyUnicode_KIND(self);
10773 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010774 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010775 for (; i < src_len; i++) {
10776 ch = PyUnicode_READ(kind, src_data, i);
10777 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010778 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010779 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010780 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010781 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010782 goto overflow;
10783 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010784 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010785 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010789 goto overflow;
10790 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010792 if (ch == '\n' || ch == '\r')
10793 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010794 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010795 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010796 if (!found)
10797 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010798
Guido van Rossumd57fd912000-03-10 22:53:23 +000010799 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010800 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801 if (!u)
10802 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010803 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804
Antoine Pitroue71d5742011-10-04 15:55:09 +020010805 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806
Antoine Pitroue71d5742011-10-04 15:55:09 +020010807 for (; i < src_len; i++) {
10808 ch = PyUnicode_READ(kind, src_data, i);
10809 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010810 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010811 incr = tabsize - (line_pos % tabsize);
10812 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010813 FILL(kind, dest_data, ' ', j, incr);
10814 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010815 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010816 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010817 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010818 line_pos++;
10819 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010820 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010821 if (ch == '\n' || ch == '\r')
10822 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010824 }
10825 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010826 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010827
Antoine Pitroue71d5742011-10-04 15:55:09 +020010828 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010829 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10830 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831}
10832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010833PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010834 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835\n\
10836Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010837such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838arguments start and end are interpreted as in slice notation.\n\
10839\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010840Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
10842static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010845 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010846 Py_ssize_t start;
10847 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010848 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849
Jesus Ceaac451502011-04-20 17:09:23 +020010850 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10851 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 if (PyUnicode_READY(self) == -1)
10855 return NULL;
10856 if (PyUnicode_READY(substring) == -1)
10857 return NULL;
10858
Victor Stinner7931d9a2011-11-04 00:22:48 +010010859 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860
10861 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 if (result == -2)
10864 return NULL;
10865
Christian Heimes217cfd12007-12-02 14:31:20 +000010866 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867}
10868
10869static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010870unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010872 void *data;
10873 enum PyUnicode_Kind kind;
10874 Py_UCS4 ch;
10875 PyObject *res;
10876
10877 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10878 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010880 }
10881 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10882 PyErr_SetString(PyExc_IndexError, "string index out of range");
10883 return NULL;
10884 }
10885 kind = PyUnicode_KIND(self);
10886 data = PyUnicode_DATA(self);
10887 ch = PyUnicode_READ(kind, data, index);
10888 if (ch < 256)
10889 return get_latin1_char(ch);
10890
10891 res = PyUnicode_New(1, ch);
10892 if (res == NULL)
10893 return NULL;
10894 kind = PyUnicode_KIND(res);
10895 data = PyUnicode_DATA(res);
10896 PyUnicode_WRITE(kind, data, 0, ch);
10897 assert(_PyUnicode_CheckConsistency(res, 1));
10898 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010899}
10900
Guido van Rossumc2504932007-09-18 19:42:40 +000010901/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010902 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010903static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010904unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905{
Guido van Rossumc2504932007-09-18 19:42:40 +000010906 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080010907 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000010908
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010909#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050010910 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010911#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 if (_PyUnicode_HASH(self) != -1)
10913 return _PyUnicode_HASH(self);
10914 if (PyUnicode_READY(self) == -1)
10915 return -1;
10916 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010010917 /*
10918 We make the hash of the empty string be 0, rather than using
10919 (prefix ^ suffix), since this slightly obfuscates the hash secret
10920 */
10921 if (len == 0) {
10922 _PyUnicode_HASH(self) = 0;
10923 return 0;
10924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925
10926 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010010927#define HASH(P) \
10928 x ^= (Py_uhash_t) *P << 7; \
10929 while (--len >= 0) \
10930 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931
Georg Brandl2fb477c2012-02-21 00:33:36 +010010932 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 switch (PyUnicode_KIND(self)) {
10934 case PyUnicode_1BYTE_KIND: {
10935 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10936 HASH(c);
10937 break;
10938 }
10939 case PyUnicode_2BYTE_KIND: {
10940 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10941 HASH(s);
10942 break;
10943 }
10944 default: {
10945 Py_UCS4 *l;
10946 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10947 "Impossible switch case in unicode_hash");
10948 l = PyUnicode_4BYTE_DATA(self);
10949 HASH(l);
10950 break;
10951 }
10952 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010010953 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
10954 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955
Guido van Rossumc2504932007-09-18 19:42:40 +000010956 if (x == -1)
10957 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010959 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010963PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010964 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010966Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967
10968static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010971 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010972 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010973 Py_ssize_t start;
10974 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975
Jesus Ceaac451502011-04-20 17:09:23 +020010976 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10977 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 if (PyUnicode_READY(self) == -1)
10981 return NULL;
10982 if (PyUnicode_READY(substring) == -1)
10983 return NULL;
10984
Victor Stinner7931d9a2011-11-04 00:22:48 +010010985 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986
10987 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 if (result == -2)
10990 return NULL;
10991
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 if (result < 0) {
10993 PyErr_SetString(PyExc_ValueError, "substring not found");
10994 return NULL;
10995 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010996
Christian Heimes217cfd12007-12-02 14:31:20 +000010997 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998}
10999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011000PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011001 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011003Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011004at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005
11006static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011007unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 Py_ssize_t i, length;
11010 int kind;
11011 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012 int cased;
11013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 if (PyUnicode_READY(self) == -1)
11015 return NULL;
11016 length = PyUnicode_GET_LENGTH(self);
11017 kind = PyUnicode_KIND(self);
11018 data = PyUnicode_DATA(self);
11019
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 if (length == 1)
11022 return PyBool_FromLong(
11023 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011025 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011028
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 for (i = 0; i < length; i++) {
11031 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011032
Benjamin Peterson29060642009-01-31 22:14:21 +000011033 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11034 return PyBool_FromLong(0);
11035 else if (!cased && Py_UNICODE_ISLOWER(ch))
11036 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011038 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039}
11040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011041PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011042 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011044Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011045at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046
11047static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011048unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 Py_ssize_t i, length;
11051 int kind;
11052 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053 int cased;
11054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055 if (PyUnicode_READY(self) == -1)
11056 return NULL;
11057 length = PyUnicode_GET_LENGTH(self);
11058 kind = PyUnicode_KIND(self);
11059 data = PyUnicode_DATA(self);
11060
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062 if (length == 1)
11063 return PyBool_FromLong(
11064 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011066 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011068 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011069
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 for (i = 0; i < length; i++) {
11072 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011073
Benjamin Peterson29060642009-01-31 22:14:21 +000011074 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11075 return PyBool_FromLong(0);
11076 else if (!cased && Py_UNICODE_ISUPPER(ch))
11077 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011079 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080}
11081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011082PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011083 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011085Return True if S is a titlecased string and there is at least one\n\
11086character in S, i.e. upper- and titlecase characters may only\n\
11087follow uncased characters and lowercase characters only cased ones.\n\
11088Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011089
11090static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011091unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 Py_ssize_t i, length;
11094 int kind;
11095 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096 int cased, previous_is_cased;
11097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 if (PyUnicode_READY(self) == -1)
11099 return NULL;
11100 length = PyUnicode_GET_LENGTH(self);
11101 kind = PyUnicode_KIND(self);
11102 data = PyUnicode_DATA(self);
11103
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 if (length == 1) {
11106 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11107 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11108 (Py_UNICODE_ISUPPER(ch) != 0));
11109 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011111 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011113 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011114
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115 cased = 0;
11116 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011117 for (i = 0; i < length; i++) {
11118 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011119
Benjamin Peterson29060642009-01-31 22:14:21 +000011120 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11121 if (previous_is_cased)
11122 return PyBool_FromLong(0);
11123 previous_is_cased = 1;
11124 cased = 1;
11125 }
11126 else if (Py_UNICODE_ISLOWER(ch)) {
11127 if (!previous_is_cased)
11128 return PyBool_FromLong(0);
11129 previous_is_cased = 1;
11130 cased = 1;
11131 }
11132 else
11133 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011135 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136}
11137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011138PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011141Return True if all characters in S are whitespace\n\
11142and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143
11144static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011145unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 Py_ssize_t i, length;
11148 int kind;
11149 void *data;
11150
11151 if (PyUnicode_READY(self) == -1)
11152 return NULL;
11153 length = PyUnicode_GET_LENGTH(self);
11154 kind = PyUnicode_KIND(self);
11155 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 if (length == 1)
11159 return PyBool_FromLong(
11160 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011162 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011164 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 for (i = 0; i < length; i++) {
11167 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011168 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011169 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011171 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172}
11173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011174PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011175 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011176\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011177Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011178and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011179
11180static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011181unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011182{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 Py_ssize_t i, length;
11184 int kind;
11185 void *data;
11186
11187 if (PyUnicode_READY(self) == -1)
11188 return NULL;
11189 length = PyUnicode_GET_LENGTH(self);
11190 kind = PyUnicode_KIND(self);
11191 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011192
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011193 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 if (length == 1)
11195 return PyBool_FromLong(
11196 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011197
11198 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011200 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011202 for (i = 0; i < length; i++) {
11203 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011204 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011205 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011206 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011207}
11208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011209PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011210 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011211\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011212Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011213and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011214
11215static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011216unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011217{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 int kind;
11219 void *data;
11220 Py_ssize_t len, i;
11221
11222 if (PyUnicode_READY(self) == -1)
11223 return NULL;
11224
11225 kind = PyUnicode_KIND(self);
11226 data = PyUnicode_DATA(self);
11227 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011228
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011229 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 if (len == 1) {
11231 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11232 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11233 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011234
11235 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239 for (i = 0; i < len; i++) {
11240 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011241 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011242 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011243 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011244 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011245}
11246
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011247PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011250Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011251False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252
11253static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011254unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 Py_ssize_t i, length;
11257 int kind;
11258 void *data;
11259
11260 if (PyUnicode_READY(self) == -1)
11261 return NULL;
11262 length = PyUnicode_GET_LENGTH(self);
11263 kind = PyUnicode_KIND(self);
11264 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 if (length == 1)
11268 return PyBool_FromLong(
11269 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011271 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 for (i = 0; i < length; i++) {
11276 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011277 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011279 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280}
11281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011282PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011283 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011285Return True if all characters in S are digits\n\
11286and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287
11288static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011289unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 Py_ssize_t i, length;
11292 int kind;
11293 void *data;
11294
11295 if (PyUnicode_READY(self) == -1)
11296 return NULL;
11297 length = PyUnicode_GET_LENGTH(self);
11298 kind = PyUnicode_KIND(self);
11299 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 if (length == 1) {
11303 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11304 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011307 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011309 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 for (i = 0; i < length; i++) {
11312 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011313 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011315 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316}
11317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011318PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011319 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011321Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011322False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323
11324static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011325unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 Py_ssize_t i, length;
11328 int kind;
11329 void *data;
11330
11331 if (PyUnicode_READY(self) == -1)
11332 return NULL;
11333 length = PyUnicode_GET_LENGTH(self);
11334 kind = PyUnicode_KIND(self);
11335 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 if (length == 1)
11339 return PyBool_FromLong(
11340 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011342 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346 for (i = 0; i < length; i++) {
11347 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011348 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011350 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351}
11352
Martin v. Löwis47383402007-08-15 07:32:56 +000011353int
11354PyUnicode_IsIdentifier(PyObject *self)
11355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 int kind;
11357 void *data;
11358 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011359 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 if (PyUnicode_READY(self) == -1) {
11362 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011363 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 }
11365
11366 /* Special case for empty strings */
11367 if (PyUnicode_GET_LENGTH(self) == 0)
11368 return 0;
11369 kind = PyUnicode_KIND(self);
11370 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011371
11372 /* PEP 3131 says that the first character must be in
11373 XID_Start and subsequent characters in XID_Continue,
11374 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011375 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011376 letters, digits, underscore). However, given the current
11377 definition of XID_Start and XID_Continue, it is sufficient
11378 to check just for these, except that _ must be allowed
11379 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011381 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011382 return 0;
11383
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011384 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011385 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011386 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011387 return 1;
11388}
11389
11390PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011391 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011392\n\
11393Return True if S is a valid identifier according\n\
11394to the language definition.");
11395
11396static PyObject*
11397unicode_isidentifier(PyObject *self)
11398{
11399 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11400}
11401
Georg Brandl559e5d72008-06-11 18:37:52 +000011402PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011403 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011404\n\
11405Return True if all characters in S are considered\n\
11406printable in repr() or S is empty, False otherwise.");
11407
11408static PyObject*
11409unicode_isprintable(PyObject *self)
11410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 Py_ssize_t i, length;
11412 int kind;
11413 void *data;
11414
11415 if (PyUnicode_READY(self) == -1)
11416 return NULL;
11417 length = PyUnicode_GET_LENGTH(self);
11418 kind = PyUnicode_KIND(self);
11419 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011420
11421 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 if (length == 1)
11423 return PyBool_FromLong(
11424 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 for (i = 0; i < length; i++) {
11427 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011428 Py_RETURN_FALSE;
11429 }
11430 }
11431 Py_RETURN_TRUE;
11432}
11433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011434PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011435 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436\n\
11437Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011438iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
11440static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011441unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011443 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444}
11445
Martin v. Löwis18e16552006-02-15 17:27:45 +000011446static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011447unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 if (PyUnicode_READY(self) == -1)
11450 return -1;
11451 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452}
11453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011454PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011457Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011458done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459
11460static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011461unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011463 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 Py_UCS4 fillchar = ' ';
11465
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011466 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467 return NULL;
11468
Benjamin Petersonbac79492012-01-14 13:34:47 -050011469 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011470 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471
Victor Stinnerc4b49542011-12-11 22:44:26 +010011472 if (PyUnicode_GET_LENGTH(self) >= width)
11473 return unicode_result_unchanged(self);
11474
11475 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476}
11477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011478PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011481Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482
11483static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011484unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011486 if (PyUnicode_READY(self) == -1)
11487 return NULL;
11488 if (PyUnicode_IS_ASCII(self))
11489 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011490 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491}
11492
/* Selectors for the strip family; they double as indexes into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string without its leading
   three characters ("|O:") */
#define STRIPNAME(i) (stripformat[i]+3)
11501
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011502/* externally visible for str.strip(unicode) */
11503PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011504_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011505{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 void *data;
11507 int kind;
11508 Py_ssize_t i, j, len;
11509 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11512 return NULL;
11513
11514 kind = PyUnicode_KIND(self);
11515 data = PyUnicode_DATA(self);
11516 len = PyUnicode_GET_LENGTH(self);
11517 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11518 PyUnicode_DATA(sepobj),
11519 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011520
Benjamin Peterson14339b62009-01-31 16:36:08 +000011521 i = 0;
11522 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 while (i < len &&
11524 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011525 i++;
11526 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011527 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011528
Benjamin Peterson14339b62009-01-31 16:36:08 +000011529 j = len;
11530 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 do {
11532 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 } while (j >= i &&
11534 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011535 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011536 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011537
Victor Stinner7931d9a2011-11-04 00:22:48 +010011538 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539}
11540
11541PyObject*
11542PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11543{
11544 unsigned char *data;
11545 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011546 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547
Victor Stinnerde636f32011-10-01 03:55:54 +020011548 if (PyUnicode_READY(self) == -1)
11549 return NULL;
11550
Victor Stinner684d5fd2012-05-03 02:32:34 +020011551 length = PyUnicode_GET_LENGTH(self);
11552 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011553
Victor Stinner684d5fd2012-05-03 02:32:34 +020011554 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011555 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556
Victor Stinnerde636f32011-10-01 03:55:54 +020011557 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011558 PyErr_SetString(PyExc_IndexError, "string index out of range");
11559 return NULL;
11560 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011561 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011562 Py_INCREF(unicode_empty);
11563 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011564 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011565
Victor Stinner684d5fd2012-05-03 02:32:34 +020011566 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011567 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011568 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011569 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011570 }
11571 else {
11572 kind = PyUnicode_KIND(self);
11573 data = PyUnicode_1BYTE_DATA(self);
11574 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011575 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011576 length);
11577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579
11580static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011581do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 int kind;
11584 void *data;
11585 Py_ssize_t len, i, j;
11586
11587 if (PyUnicode_READY(self) == -1)
11588 return NULL;
11589
11590 kind = PyUnicode_KIND(self);
11591 data = PyUnicode_DATA(self);
11592 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011593
Benjamin Peterson14339b62009-01-31 16:36:08 +000011594 i = 0;
11595 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011597 i++;
11598 }
11599 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011600
Benjamin Peterson14339b62009-01-31 16:36:08 +000011601 j = len;
11602 if (striptype != LEFTSTRIP) {
11603 do {
11604 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011606 j++;
11607 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011608
Victor Stinner7931d9a2011-11-04 00:22:48 +010011609 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610}
11611
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011612
11613static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011614do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011615{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011616 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011617
Benjamin Peterson14339b62009-01-31 16:36:08 +000011618 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11619 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011620
Benjamin Peterson14339b62009-01-31 16:36:08 +000011621 if (sep != NULL && sep != Py_None) {
11622 if (PyUnicode_Check(sep))
11623 return _PyUnicode_XStrip(self, striptype, sep);
11624 else {
11625 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 "%s arg must be None or str",
11627 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011628 return NULL;
11629 }
11630 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011631
Benjamin Peterson14339b62009-01-31 16:36:08 +000011632 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011633}
11634
11635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011636PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011638\n\
11639Return a copy of the string S with leading and trailing\n\
11640whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011641If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011642
11643static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011644unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011645{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011646 if (PyTuple_GET_SIZE(args) == 0)
11647 return do_strip(self, BOTHSTRIP); /* Common case */
11648 else
11649 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011650}
11651
11652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011653PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011654 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011655\n\
11656Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011657If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011658
11659static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011660unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011661{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011662 if (PyTuple_GET_SIZE(args) == 0)
11663 return do_strip(self, LEFTSTRIP); /* Common case */
11664 else
11665 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011666}
11667
11668
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011669PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011670 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011671\n\
11672Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011673If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011674
11675static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011676unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011677{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011678 if (PyTuple_GET_SIZE(args) == 0)
11679 return do_strip(self, RIGHTSTRIP); /* Common case */
11680 else
11681 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011682}
11683
11684
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011686unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011688 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690
Georg Brandl222de0f2009-04-12 12:01:50 +000011691 if (len < 1) {
11692 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011693 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695
Victor Stinnerc4b49542011-12-11 22:44:26 +010011696 /* no repeat, return original string */
11697 if (len == 1)
11698 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011699
Benjamin Petersonbac79492012-01-14 13:34:47 -050011700 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 return NULL;
11702
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011703 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011704 PyErr_SetString(PyExc_OverflowError,
11705 "repeated string is too long");
11706 return NULL;
11707 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011709
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011710 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711 if (!u)
11712 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011713 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 if (PyUnicode_GET_LENGTH(str) == 1) {
11716 const int kind = PyUnicode_KIND(str);
11717 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011718 if (kind == PyUnicode_1BYTE_KIND) {
11719 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011720 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011721 }
11722 else if (kind == PyUnicode_2BYTE_KIND) {
11723 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011724 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011725 ucs2[n] = fill_char;
11726 } else {
11727 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11728 assert(kind == PyUnicode_4BYTE_KIND);
11729 for (n = 0; n < len; ++n)
11730 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011731 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011732 }
11733 else {
11734 /* number of characters copied this far */
11735 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011736 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 char *to = (char *) PyUnicode_DATA(u);
11738 Py_MEMCPY(to, PyUnicode_DATA(str),
11739 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 n = (done <= nchars-done) ? done : nchars-done;
11742 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011743 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 }
11746
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011747 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011748 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749}
11750
Alexander Belopolsky40018472011-02-26 01:02:56 +000011751PyObject *
11752PyUnicode_Replace(PyObject *obj,
11753 PyObject *subobj,
11754 PyObject *replobj,
11755 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756{
11757 PyObject *self;
11758 PyObject *str1;
11759 PyObject *str2;
11760 PyObject *result;
11761
11762 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011763 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011766 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011767 Py_DECREF(self);
11768 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769 }
11770 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011771 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 Py_DECREF(self);
11773 Py_DECREF(str1);
11774 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011776 if (PyUnicode_READY(self) == -1 ||
11777 PyUnicode_READY(str1) == -1 ||
11778 PyUnicode_READY(str2) == -1)
11779 result = NULL;
11780 else
11781 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782 Py_DECREF(self);
11783 Py_DECREF(str1);
11784 Py_DECREF(str2);
11785 return result;
11786}
11787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011788PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011789 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790\n\
11791Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011792old replaced by new. If the optional argument count is\n\
11793given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794
11795static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 PyObject *str1;
11799 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011800 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801 PyObject *result;
11802
Martin v. Löwis18e16552006-02-15 17:27:45 +000011803 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011805 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011808 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 return NULL;
11810 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011811 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 Py_DECREF(str1);
11813 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011814 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011815 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11816 result = NULL;
11817 else
11818 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819
11820 Py_DECREF(str1);
11821 Py_DECREF(str2);
11822 return result;
11823}
11824
Alexander Belopolsky40018472011-02-26 01:02:56 +000011825static PyObject *
11826unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011828 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 Py_ssize_t isize;
11830 Py_ssize_t osize, squote, dquote, i, o;
11831 Py_UCS4 max, quote;
11832 int ikind, okind;
11833 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011836 return NULL;
11837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 isize = PyUnicode_GET_LENGTH(unicode);
11839 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 /* Compute length of output, quote characters, and
11842 maximum character */
11843 osize = 2; /* quotes */
11844 max = 127;
11845 squote = dquote = 0;
11846 ikind = PyUnicode_KIND(unicode);
11847 for (i = 0; i < isize; i++) {
11848 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11849 switch (ch) {
11850 case '\'': squote++; osize++; break;
11851 case '"': dquote++; osize++; break;
11852 case '\\': case '\t': case '\r': case '\n':
11853 osize += 2; break;
11854 default:
11855 /* Fast-path ASCII */
11856 if (ch < ' ' || ch == 0x7f)
11857 osize += 4; /* \xHH */
11858 else if (ch < 0x7f)
11859 osize++;
11860 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11861 osize++;
11862 max = ch > max ? ch : max;
11863 }
11864 else if (ch < 0x100)
11865 osize += 4; /* \xHH */
11866 else if (ch < 0x10000)
11867 osize += 6; /* \uHHHH */
11868 else
11869 osize += 10; /* \uHHHHHHHH */
11870 }
11871 }
11872
11873 quote = '\'';
11874 if (squote) {
11875 if (dquote)
11876 /* Both squote and dquote present. Use squote,
11877 and escape them */
11878 osize += squote;
11879 else
11880 quote = '"';
11881 }
11882
11883 repr = PyUnicode_New(osize, max);
11884 if (repr == NULL)
11885 return NULL;
11886 okind = PyUnicode_KIND(repr);
11887 odata = PyUnicode_DATA(repr);
11888
11889 PyUnicode_WRITE(okind, odata, 0, quote);
11890 PyUnicode_WRITE(okind, odata, osize-1, quote);
11891
11892 for (i = 0, o = 1; i < isize; i++) {
11893 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011894
11895 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 if ((ch == quote) || (ch == '\\')) {
11897 PyUnicode_WRITE(okind, odata, o++, '\\');
11898 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011899 continue;
11900 }
11901
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011903 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 PyUnicode_WRITE(okind, odata, o++, '\\');
11905 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011906 }
11907 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 PyUnicode_WRITE(okind, odata, o++, '\\');
11909 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011910 }
11911 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 PyUnicode_WRITE(okind, odata, o++, '\\');
11913 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011914 }
11915
11916 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011917 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 PyUnicode_WRITE(okind, odata, o++, '\\');
11919 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011920 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11921 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011922 }
11923
Georg Brandl559e5d72008-06-11 18:37:52 +000011924 /* Copy ASCII characters as-is */
11925 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011927 }
11928
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011930 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011931 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011932 (categories Z* and C* except ASCII space)
11933 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011935 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000011936 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011939 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11940 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011941 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011942 /* Map 16-bit characters to '\uxxxx' */
11943 else if (ch <= 0xffff) {
11944 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011945 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11946 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11947 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11948 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011949 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011950 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011951 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011952 PyUnicode_WRITE(okind, odata, o++, 'U');
11953 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11954 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11955 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11956 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020011957 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11958 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11959 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11960 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011961 }
11962 }
11963 /* Copy characters as-is */
11964 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011966 }
11967 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011970 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011971 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972}
11973
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011974PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011975 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976\n\
11977Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011978such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979arguments start and end are interpreted as in slice notation.\n\
11980\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011981Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982
11983static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011986 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011987 Py_ssize_t start;
11988 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011989 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990
Jesus Ceaac451502011-04-20 17:09:23 +020011991 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11992 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011993 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 if (PyUnicode_READY(self) == -1)
11996 return NULL;
11997 if (PyUnicode_READY(substring) == -1)
11998 return NULL;
11999
Victor Stinner7931d9a2011-11-04 00:22:48 +010012000 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001
12002 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 if (result == -2)
12005 return NULL;
12006
Christian Heimes217cfd12007-12-02 14:31:20 +000012007 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008}
12009
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012010PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012011 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012013Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014
12015static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012018 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012019 Py_ssize_t start;
12020 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012021 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022
Jesus Ceaac451502011-04-20 17:09:23 +020012023 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12024 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 if (PyUnicode_READY(self) == -1)
12028 return NULL;
12029 if (PyUnicode_READY(substring) == -1)
12030 return NULL;
12031
Victor Stinner7931d9a2011-11-04 00:22:48 +010012032 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033
12034 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 if (result == -2)
12037 return NULL;
12038
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039 if (result < 0) {
12040 PyErr_SetString(PyExc_ValueError, "substring not found");
12041 return NULL;
12042 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043
Christian Heimes217cfd12007-12-02 14:31:20 +000012044 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045}
12046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012047PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012048 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012050Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012051done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052
12053static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012054unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012056 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 Py_UCS4 fillchar = ' ';
12058
Victor Stinnere9a29352011-10-01 02:14:59 +020012059 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012061
Benjamin Petersonbac79492012-01-14 13:34:47 -050012062 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063 return NULL;
12064
Victor Stinnerc4b49542011-12-11 22:44:26 +010012065 if (PyUnicode_GET_LENGTH(self) >= width)
12066 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067
Victor Stinnerc4b49542011-12-11 22:44:26 +010012068 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069}
12070
Alexander Belopolsky40018472011-02-26 01:02:56 +000012071PyObject *
12072PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073{
12074 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012075
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076 s = PyUnicode_FromObject(s);
12077 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012078 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012079 if (sep != NULL) {
12080 sep = PyUnicode_FromObject(sep);
12081 if (sep == NULL) {
12082 Py_DECREF(s);
12083 return NULL;
12084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085 }
12086
Victor Stinner9310abb2011-10-05 00:59:23 +020012087 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
12089 Py_DECREF(s);
12090 Py_XDECREF(sep);
12091 return result;
12092}
12093
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012094PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012095 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096\n\
12097Return a list of the words in S, using sep as the\n\
12098delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012099splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012100whitespace string is a separator and empty strings are\n\
12101removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102
12103static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012104unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012106 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012108 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012110 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12111 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112 return NULL;
12113
12114 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012115 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012117 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012119 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120}
12121
Thomas Wouters477c8d52006-05-27 19:21:47 +000012122PyObject *
12123PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12124{
12125 PyObject* str_obj;
12126 PyObject* sep_obj;
12127 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 int kind1, kind2, kind;
12129 void *buf1 = NULL, *buf2 = NULL;
12130 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012131
12132 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012133 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012135 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012136 if (!sep_obj) {
12137 Py_DECREF(str_obj);
12138 return NULL;
12139 }
12140 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12141 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012142 Py_DECREF(str_obj);
12143 return NULL;
12144 }
12145
Victor Stinner14f8f022011-10-05 20:58:25 +020012146 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012147 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012148 kind = Py_MAX(kind1, kind2);
12149 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012151 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 if (!buf1)
12153 goto onError;
12154 buf2 = PyUnicode_DATA(sep_obj);
12155 if (kind2 != kind)
12156 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12157 if (!buf2)
12158 goto onError;
12159 len1 = PyUnicode_GET_LENGTH(str_obj);
12160 len2 = PyUnicode_GET_LENGTH(sep_obj);
12161
Benjamin Petersonead6b532011-12-20 17:23:42 -060012162 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012164 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12165 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12166 else
12167 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168 break;
12169 case PyUnicode_2BYTE_KIND:
12170 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12171 break;
12172 case PyUnicode_4BYTE_KIND:
12173 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12174 break;
12175 default:
12176 assert(0);
12177 out = 0;
12178 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012179
12180 Py_DECREF(sep_obj);
12181 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 if (kind1 != kind)
12183 PyMem_Free(buf1);
12184 if (kind2 != kind)
12185 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012186
12187 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 onError:
12189 Py_DECREF(sep_obj);
12190 Py_DECREF(str_obj);
12191 if (kind1 != kind && buf1)
12192 PyMem_Free(buf1);
12193 if (kind2 != kind && buf2)
12194 PyMem_Free(buf2);
12195 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012196}
12197
12198
12199PyObject *
12200PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12201{
12202 PyObject* str_obj;
12203 PyObject* sep_obj;
12204 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 int kind1, kind2, kind;
12206 void *buf1 = NULL, *buf2 = NULL;
12207 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012208
12209 str_obj = PyUnicode_FromObject(str_in);
12210 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012211 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012212 sep_obj = PyUnicode_FromObject(sep_in);
12213 if (!sep_obj) {
12214 Py_DECREF(str_obj);
12215 return NULL;
12216 }
12217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 kind1 = PyUnicode_KIND(str_in);
12219 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012220 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 buf1 = PyUnicode_DATA(str_in);
12222 if (kind1 != kind)
12223 buf1 = _PyUnicode_AsKind(str_in, kind);
12224 if (!buf1)
12225 goto onError;
12226 buf2 = PyUnicode_DATA(sep_obj);
12227 if (kind2 != kind)
12228 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12229 if (!buf2)
12230 goto onError;
12231 len1 = PyUnicode_GET_LENGTH(str_obj);
12232 len2 = PyUnicode_GET_LENGTH(sep_obj);
12233
Benjamin Petersonead6b532011-12-20 17:23:42 -060012234 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012236 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12237 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12238 else
12239 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 break;
12241 case PyUnicode_2BYTE_KIND:
12242 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12243 break;
12244 case PyUnicode_4BYTE_KIND:
12245 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12246 break;
12247 default:
12248 assert(0);
12249 out = 0;
12250 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012251
12252 Py_DECREF(sep_obj);
12253 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 if (kind1 != kind)
12255 PyMem_Free(buf1);
12256 if (kind2 != kind)
12257 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012258
12259 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012260 onError:
12261 Py_DECREF(sep_obj);
12262 Py_DECREF(str_obj);
12263 if (kind1 != kind && buf1)
12264 PyMem_Free(buf1);
12265 if (kind2 != kind && buf2)
12266 PyMem_Free(buf2);
12267 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012268}
12269
12270PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012271 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012272\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012273Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012274the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012275found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012276
12277static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012278unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012279{
Victor Stinner9310abb2011-10-05 00:59:23 +020012280 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012281}
12282
12283PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012284 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012285\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012286Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012287the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012288separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012289
12290static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012291unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012292{
Victor Stinner9310abb2011-10-05 00:59:23 +020012293 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012294}
12295
Alexander Belopolsky40018472011-02-26 01:02:56 +000012296PyObject *
12297PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012298{
12299 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012300
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012301 s = PyUnicode_FromObject(s);
12302 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012303 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 if (sep != NULL) {
12305 sep = PyUnicode_FromObject(sep);
12306 if (sep == NULL) {
12307 Py_DECREF(s);
12308 return NULL;
12309 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012310 }
12311
Victor Stinner9310abb2011-10-05 00:59:23 +020012312 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012313
12314 Py_DECREF(s);
12315 Py_XDECREF(sep);
12316 return result;
12317}
12318
12319PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012320 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012321\n\
12322Return a list of the words in S, using sep as the\n\
12323delimiter string, starting at the end of the string and\n\
12324working to the front. If maxsplit is given, at most maxsplit\n\
12325splits are done. If sep is not specified, any whitespace string\n\
12326is a separator.");
12327
12328static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012329unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012330{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012331 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012332 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012333 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012334
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012335 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12336 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012337 return NULL;
12338
12339 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012340 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012341 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012342 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012343 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012344 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012345}
12346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012347PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349\n\
12350Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012351Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012352is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353
12354static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012355unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012357 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012358 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012360 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12361 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362 return NULL;
12363
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012364 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365}
12366
12367static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012368PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012370 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371}
12372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012373PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012374 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375\n\
12376Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012377and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378
12379static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012380unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012382 if (PyUnicode_READY(self) == -1)
12383 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012384 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385}
12386
Georg Brandlceee0772007-11-27 23:48:05 +000012387PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012389\n\
12390Return a translation table usable for str.translate().\n\
12391If there is only one argument, it must be a dictionary mapping Unicode\n\
12392ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012393Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012394If there are two arguments, they must be strings of equal length, and\n\
12395in the resulting dictionary, each character in x will be mapped to the\n\
12396character at the same position in y. If there is a third argument, it\n\
12397must be a string, whose characters will be mapped to None in the result.");
12398
12399static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012400unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012401{
12402 PyObject *x, *y = NULL, *z = NULL;
12403 PyObject *new = NULL, *key, *value;
12404 Py_ssize_t i = 0;
12405 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012406
Georg Brandlceee0772007-11-27 23:48:05 +000012407 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12408 return NULL;
12409 new = PyDict_New();
12410 if (!new)
12411 return NULL;
12412 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 int x_kind, y_kind, z_kind;
12414 void *x_data, *y_data, *z_data;
12415
Georg Brandlceee0772007-11-27 23:48:05 +000012416 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012417 if (!PyUnicode_Check(x)) {
12418 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12419 "be a string if there is a second argument");
12420 goto err;
12421 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012423 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12424 "arguments must have equal length");
12425 goto err;
12426 }
12427 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 x_kind = PyUnicode_KIND(x);
12429 y_kind = PyUnicode_KIND(y);
12430 x_data = PyUnicode_DATA(x);
12431 y_data = PyUnicode_DATA(y);
12432 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12433 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012434 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012435 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012436 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012437 if (!value) {
12438 Py_DECREF(key);
12439 goto err;
12440 }
Georg Brandlceee0772007-11-27 23:48:05 +000012441 res = PyDict_SetItem(new, key, value);
12442 Py_DECREF(key);
12443 Py_DECREF(value);
12444 if (res < 0)
12445 goto err;
12446 }
12447 /* create entries for deleting chars in z */
12448 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449 z_kind = PyUnicode_KIND(z);
12450 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012451 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012453 if (!key)
12454 goto err;
12455 res = PyDict_SetItem(new, key, Py_None);
12456 Py_DECREF(key);
12457 if (res < 0)
12458 goto err;
12459 }
12460 }
12461 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 int kind;
12463 void *data;
12464
Georg Brandlceee0772007-11-27 23:48:05 +000012465 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012466 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012467 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12468 "to maketrans it must be a dict");
12469 goto err;
12470 }
12471 /* copy entries into the new dict, converting string keys to int keys */
12472 while (PyDict_Next(x, &i, &key, &value)) {
12473 if (PyUnicode_Check(key)) {
12474 /* convert string keys to integer keys */
12475 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012476 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012477 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12478 "table must be of length 1");
12479 goto err;
12480 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012481 kind = PyUnicode_KIND(key);
12482 data = PyUnicode_DATA(key);
12483 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012484 if (!newkey)
12485 goto err;
12486 res = PyDict_SetItem(new, newkey, value);
12487 Py_DECREF(newkey);
12488 if (res < 0)
12489 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012490 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012491 /* just keep integer keys */
12492 if (PyDict_SetItem(new, key, value) < 0)
12493 goto err;
12494 } else {
12495 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12496 "be strings or integers");
12497 goto err;
12498 }
12499 }
12500 }
12501 return new;
12502 err:
12503 Py_DECREF(new);
12504 return NULL;
12505}
12506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012507PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012508 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509\n\
12510Return a copy of the string S, where all characters have been mapped\n\
12511through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012512Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012513Unmapped characters are left untouched. Characters mapped to None\n\
12514are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
12516static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520}
12521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012522PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012523 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012525Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526
12527static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012528unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012530 if (PyUnicode_READY(self) == -1)
12531 return NULL;
12532 if (PyUnicode_IS_ASCII(self))
12533 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012534 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535}
12536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012537PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012538 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012540Pad a numeric string S with zeros on the left, to fill a field\n\
12541of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542
12543static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012544unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012546 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012547 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012548 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 int kind;
12550 void *data;
12551 Py_UCS4 chr;
12552
Martin v. Löwis18e16552006-02-15 17:27:45 +000012553 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554 return NULL;
12555
Benjamin Petersonbac79492012-01-14 13:34:47 -050012556 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558
Victor Stinnerc4b49542011-12-11 22:44:26 +010012559 if (PyUnicode_GET_LENGTH(self) >= width)
12560 return unicode_result_unchanged(self);
12561
12562 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563
12564 u = pad(self, fill, 0, '0');
12565
Walter Dörwald068325e2002-04-15 13:36:47 +000012566 if (u == NULL)
12567 return NULL;
12568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 kind = PyUnicode_KIND(u);
12570 data = PyUnicode_DATA(u);
12571 chr = PyUnicode_READ(kind, data, fill);
12572
12573 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 PyUnicode_WRITE(kind, data, 0, chr);
12576 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577 }
12578
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012579 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012580 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582
#if 0
/* Compiled out by the enclosing "#if 0": forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012591PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012594Return True if S starts with the specified prefix, False otherwise.\n\
12595With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012596With optional end, stop comparing S at that position.\n\
12597prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598
12599static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012600unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012601 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012603 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012604 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012605 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012606 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012607 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608
Jesus Ceaac451502011-04-20 17:09:23 +020012609 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012611 if (PyTuple_Check(subobj)) {
12612 Py_ssize_t i;
12613 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012614 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012615 if (substring == NULL)
12616 return NULL;
12617 result = tailmatch(self, substring, start, end, -1);
12618 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012619 if (result == -1)
12620 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012621 if (result) {
12622 Py_RETURN_TRUE;
12623 }
12624 }
12625 /* nothing matched */
12626 Py_RETURN_FALSE;
12627 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012628 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012629 if (substring == NULL) {
12630 if (PyErr_ExceptionMatches(PyExc_TypeError))
12631 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12632 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012633 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012634 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012635 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012637 if (result == -1)
12638 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012639 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640}
12641
12642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012643PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012644 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012646Return True if S ends with the specified suffix, False otherwise.\n\
12647With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012648With optional end, stop comparing S at that position.\n\
12649suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650
12651static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012652unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012653 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012655 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012656 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012657 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012658 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012659 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660
Jesus Ceaac451502011-04-20 17:09:23 +020012661 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012662 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012663 if (PyTuple_Check(subobj)) {
12664 Py_ssize_t i;
12665 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012666 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012667 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012668 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012669 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012670 result = tailmatch(self, substring, start, end, +1);
12671 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012672 if (result == -1)
12673 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012674 if (result) {
12675 Py_RETURN_TRUE;
12676 }
12677 }
12678 Py_RETURN_FALSE;
12679 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012680 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012681 if (substring == NULL) {
12682 if (PyErr_ExceptionMatches(PyExc_TypeError))
12683 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12684 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012685 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012686 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012687 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010012688 if (result == -1)
12689 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012691 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692}
12693
Victor Stinner202fdca2012-05-07 12:47:02 +020012694Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012695_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012696{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012697 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012698 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12699 writer->data = PyUnicode_DATA(writer->buffer);
12700 writer->kind = PyUnicode_KIND(writer->buffer);
12701}
12702
Victor Stinnerd3f08822012-05-29 12:57:52 +020012703void
12704_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012705{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012706 memset(writer, 0, sizeof(*writer));
12707#ifdef Py_DEBUG
12708 writer->kind = 5; /* invalid kind */
12709#endif
12710 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012711 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012712}
12713
Victor Stinnerd3f08822012-05-29 12:57:52 +020012714int
12715_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12716 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012717{
12718 Py_ssize_t newlen;
12719 PyObject *newbuffer;
12720
Victor Stinnerd3f08822012-05-29 12:57:52 +020012721 assert(length > 0);
12722
Victor Stinner202fdca2012-05-07 12:47:02 +020012723 if (length > PY_SSIZE_T_MAX - writer->pos) {
12724 PyErr_NoMemory();
12725 return -1;
12726 }
12727 newlen = writer->pos + length;
12728
Victor Stinnerd3f08822012-05-29 12:57:52 +020012729 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012730 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012731 /* overallocate 25% to limit the number of resize */
12732 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12733 newlen += newlen / 4;
12734 if (newlen < writer->min_length)
12735 newlen = writer->min_length;
12736 }
12737 writer->buffer = PyUnicode_New(newlen, maxchar);
12738 if (writer->buffer == NULL)
12739 return -1;
12740 _PyUnicodeWriter_Update(writer);
12741 return 0;
12742 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012743
Victor Stinnerd3f08822012-05-29 12:57:52 +020012744 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012745 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012746 /* overallocate 25% to limit the number of resize */
12747 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12748 newlen += newlen / 4;
12749 if (newlen < writer->min_length)
12750 newlen = writer->min_length;
12751 }
12752
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012753 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012754 /* resize + widen */
12755 newbuffer = PyUnicode_New(newlen, maxchar);
12756 if (newbuffer == NULL)
12757 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012758 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12759 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012760 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012761 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012762 }
12763 else {
12764 newbuffer = resize_compact(writer->buffer, newlen);
12765 if (newbuffer == NULL)
12766 return -1;
12767 }
12768 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012769 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012770 }
12771 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012772 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012773 newbuffer = PyUnicode_New(writer->size, maxchar);
12774 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012775 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012776 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12777 writer->buffer, 0, writer->pos);
12778 Py_DECREF(writer->buffer);
12779 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012780 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012781 }
12782 return 0;
12783}
12784
Victor Stinnerd3f08822012-05-29 12:57:52 +020012785int
12786_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12787{
12788 Py_UCS4 maxchar;
12789 Py_ssize_t len;
12790
12791 if (PyUnicode_READY(str) == -1)
12792 return -1;
12793 len = PyUnicode_GET_LENGTH(str);
12794 if (len == 0)
12795 return 0;
12796 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12797 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012798 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012799 Py_INCREF(str);
12800 writer->buffer = str;
12801 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012802 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012803 writer->size = 0;
12804 writer->pos += len;
12805 return 0;
12806 }
12807 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12808 return -1;
12809 }
12810 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12811 str, 0, len);
12812 writer->pos += len;
12813 return 0;
12814}
12815
Victor Stinnere215d962012-10-06 23:03:36 +020012816int
12817_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12818{
12819 Py_UCS4 maxchar;
12820
12821 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12822 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12823 return -1;
12824 unicode_write_cstr(writer->buffer, writer->pos, str, len);
12825 writer->pos += len;
12826 return 0;
12827}
12828
Victor Stinnerd3f08822012-05-29 12:57:52 +020012829PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012830_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012831{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012832 if (writer->pos == 0) {
12833 Py_XDECREF(writer->buffer);
12834 Py_INCREF(unicode_empty);
12835 return unicode_empty;
12836 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012837 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012838 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12839 return writer->buffer;
12840 }
12841 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12842 PyObject *newbuffer;
12843 newbuffer = resize_compact(writer->buffer, writer->pos);
12844 if (newbuffer == NULL) {
12845 Py_DECREF(writer->buffer);
12846 return NULL;
12847 }
12848 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012849 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012850 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012851 return writer->buffer;
12852}
12853
Victor Stinnerd3f08822012-05-29 12:57:52 +020012854void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012855_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012856{
12857 Py_CLEAR(writer->buffer);
12858}
12859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012860#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012861
12862PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012863 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012864\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012865Return a formatted version of S, using substitutions from args and kwargs.\n\
12866The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012867
Eric Smith27bbca62010-11-04 17:06:58 +000012868PyDoc_STRVAR(format_map__doc__,
12869 "S.format_map(mapping) -> str\n\
12870\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012871Return a formatted version of S, using substitutions from mapping.\n\
12872The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012873
Eric Smith4a7d76d2008-05-30 18:10:19 +000012874static PyObject *
12875unicode__format__(PyObject* self, PyObject* args)
12876{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012877 PyObject *format_spec;
12878 _PyUnicodeWriter writer;
12879 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012880
12881 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12882 return NULL;
12883
Victor Stinnerd3f08822012-05-29 12:57:52 +020012884 if (PyUnicode_READY(self) == -1)
12885 return NULL;
12886 _PyUnicodeWriter_Init(&writer, 0);
12887 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12888 self, format_spec, 0,
12889 PyUnicode_GET_LENGTH(format_spec));
12890 if (ret == -1) {
12891 _PyUnicodeWriter_Dealloc(&writer);
12892 return NULL;
12893 }
12894 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012895}
12896
Eric Smith8c663262007-08-25 02:26:07 +000012897PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012898 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012899\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012900Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012901
12902static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012903unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012905 Py_ssize_t size;
12906
12907 /* If it's a compact object, account for base structure +
12908 character data. */
12909 if (PyUnicode_IS_COMPACT_ASCII(v))
12910 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12911 else if (PyUnicode_IS_COMPACT(v))
12912 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012913 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914 else {
12915 /* If it is a two-block object, account for base object, and
12916 for character block if present. */
12917 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012918 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012920 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012921 }
12922 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012923 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012924 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012926 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012927 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928
12929 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012930}
12931
12932PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012933 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012934
12935static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012936unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012937{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012938 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 if (!copy)
12940 return NULL;
12941 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012942}
12943
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012945 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012946 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012947 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
12948 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012949 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12950 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050012951 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012952 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12953 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12954 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12955 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12956 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012957 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012958 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12959 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12960 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012961 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012962 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12963 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12964 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012965 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012966 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012967 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012968 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012969 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12970 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12971 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12972 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12973 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12974 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12975 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12976 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12977 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12978 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12979 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12980 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12981 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12982 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012983 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012984 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012985 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012986 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012987 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012988 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012989 {"maketrans", (PyCFunction) unicode_maketrans,
12990 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012991 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012992#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012993 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012994 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012995#endif
12996
Benjamin Peterson14339b62009-01-31 16:36:08 +000012997 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012998 {NULL, NULL}
12999};
13000
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013001static PyObject *
13002unicode_mod(PyObject *v, PyObject *w)
13003{
Brian Curtindfc80e32011-08-10 20:28:54 -050013004 if (!PyUnicode_Check(v))
13005 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013006 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013007}
13008
13009static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013010 0, /*nb_add*/
13011 0, /*nb_subtract*/
13012 0, /*nb_multiply*/
13013 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013014};
13015
Guido van Rossumd57fd912000-03-10 22:53:23 +000013016static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013017 (lenfunc) unicode_length, /* sq_length */
13018 PyUnicode_Concat, /* sq_concat */
13019 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13020 (ssizeargfunc) unicode_getitem, /* sq_item */
13021 0, /* sq_slice */
13022 0, /* sq_ass_item */
13023 0, /* sq_ass_slice */
13024 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013025};
13026
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013027static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013028unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 if (PyUnicode_READY(self) == -1)
13031 return NULL;
13032
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013033 if (PyIndex_Check(item)) {
13034 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013035 if (i == -1 && PyErr_Occurred())
13036 return NULL;
13037 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013039 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013040 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013041 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013042 PyObject *result;
13043 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013044 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013045 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013047 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013048 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013049 return NULL;
13050 }
13051
13052 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013053 Py_INCREF(unicode_empty);
13054 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013055 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013056 slicelength == PyUnicode_GET_LENGTH(self)) {
13057 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013058 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013059 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013060 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013061 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013062 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013063 src_kind = PyUnicode_KIND(self);
13064 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013065 if (!PyUnicode_IS_ASCII(self)) {
13066 kind_limit = kind_maxchar_limit(src_kind);
13067 max_char = 0;
13068 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13069 ch = PyUnicode_READ(src_kind, src_data, cur);
13070 if (ch > max_char) {
13071 max_char = ch;
13072 if (max_char >= kind_limit)
13073 break;
13074 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013075 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013076 }
Victor Stinner55c99112011-10-13 01:17:06 +020013077 else
13078 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013079 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013080 if (result == NULL)
13081 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013082 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013083 dest_data = PyUnicode_DATA(result);
13084
13085 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013086 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13087 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013088 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013089 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013090 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013091 } else {
13092 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13093 return NULL;
13094 }
13095}
13096
13097static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013098 (lenfunc)unicode_length, /* mp_length */
13099 (binaryfunc)unicode_subscript, /* mp_subscript */
13100 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013101};
13102
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104/* Helpers for PyUnicode_Format() */
13105
Victor Stinnera47082312012-10-04 02:19:54 +020013106struct unicode_formatter_t {
13107 PyObject *args;
13108 int args_owned;
13109 Py_ssize_t arglen, argidx;
13110 PyObject *dict;
13111
13112 enum PyUnicode_Kind fmtkind;
13113 Py_ssize_t fmtcnt, fmtpos;
13114 void *fmtdata;
13115 PyObject *fmtstr;
13116
13117 _PyUnicodeWriter writer;
13118};
13119
13120struct unicode_format_arg_t {
13121 Py_UCS4 ch;
13122 int flags;
13123 Py_ssize_t width;
13124 int prec;
13125 int sign;
13126};
13127
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013129unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130{
Victor Stinnera47082312012-10-04 02:19:54 +020013131 Py_ssize_t argidx = ctx->argidx;
13132
13133 if (argidx < ctx->arglen) {
13134 ctx->argidx++;
13135 if (ctx->arglen < 0)
13136 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013137 else
Victor Stinnera47082312012-10-04 02:19:54 +020013138 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139 }
13140 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142 return NULL;
13143}
13144
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013145/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146
Victor Stinnera47082312012-10-04 02:19:54 +020013147/* Format a float into the writer if the writer is not NULL, or into *p_output
13148 otherwise.
13149
13150 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013151static int
Victor Stinnera47082312012-10-04 02:19:54 +020013152formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13153 PyObject **p_output,
13154 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013156 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013158 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013159 int prec;
13160 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013161
Guido van Rossumd57fd912000-03-10 22:53:23 +000013162 x = PyFloat_AsDouble(v);
13163 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013164 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013165
Victor Stinnera47082312012-10-04 02:19:54 +020013166 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013168 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013169
Victor Stinnera47082312012-10-04 02:19:54 +020013170 if (arg->flags & F_ALT)
13171 dtoa_flags = Py_DTSF_ALT;
13172 else
13173 dtoa_flags = 0;
13174 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013175 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013176 return -1;
13177 len = strlen(p);
13178 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013179 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13180 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013181 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013182 }
Victor Stinner184252a2012-06-16 02:57:41 +020013183 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013184 writer->pos += len;
13185 }
13186 else
13187 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013188 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013189 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190}
13191
Victor Stinnerd0880d52012-04-27 23:40:13 +020013192/* formatlong() emulates the format codes d, u, o, x and X, and
13193 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13194 * Python's regular ints.
13195 * Return value: a new PyUnicodeObject*, or NULL if error.
13196 * The output string is of the form
13197 * "-"? ("0x" | "0X")? digit+
13198 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13199 * set in flags. The case of hex digits will be correct,
13200 * There will be at least prec digits, zero-filled on the left if
13201 * necessary to get that many.
13202 * val object to be converted
13203 * flags bitmask of format flags; only F_ALT is looked at
13204 * prec minimum number of digits; 0-fill on left if needed
13205 * type a character in [duoxX]; u acts the same as d
13206 *
13207 * CAUTION: o, x and X conversions on regular ints can never
13208 * produce a '-' sign, but can for Python's unbounded ints.
13209 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013210static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013211formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013212{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013213 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013214 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013215 Py_ssize_t i;
13216 int sign; /* 1 if '-', else 0 */
13217 int len; /* number of characters */
13218 Py_ssize_t llen;
13219 int numdigits; /* len == numnondigits + numdigits */
13220 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013221 int prec = arg->prec;
13222 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013223
Victor Stinnerd0880d52012-04-27 23:40:13 +020013224 /* Avoid exceeding SSIZE_T_MAX */
13225 if (prec > INT_MAX-3) {
13226 PyErr_SetString(PyExc_OverflowError,
13227 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013228 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013229 }
13230
13231 assert(PyLong_Check(val));
13232
13233 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013234 default:
13235 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013236 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013237 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013238 case 'u':
13239 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013240 if (PyBool_Check(val))
13241 result = PyNumber_ToBase(val, 10);
13242 else
13243 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013244 break;
13245 case 'o':
13246 numnondigits = 2;
13247 result = PyNumber_ToBase(val, 8);
13248 break;
13249 case 'x':
13250 case 'X':
13251 numnondigits = 2;
13252 result = PyNumber_ToBase(val, 16);
13253 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013254 }
13255 if (!result)
13256 return NULL;
13257
13258 assert(unicode_modifiable(result));
13259 assert(PyUnicode_IS_READY(result));
13260 assert(PyUnicode_IS_ASCII(result));
13261
13262 /* To modify the string in-place, there can only be one reference. */
13263 if (Py_REFCNT(result) != 1) {
13264 PyErr_BadInternalCall();
13265 return NULL;
13266 }
13267 buf = PyUnicode_DATA(result);
13268 llen = PyUnicode_GET_LENGTH(result);
13269 if (llen > INT_MAX) {
13270 PyErr_SetString(PyExc_ValueError,
13271 "string too large in _PyBytes_FormatLong");
13272 return NULL;
13273 }
13274 len = (int)llen;
13275 sign = buf[0] == '-';
13276 numnondigits += sign;
13277 numdigits = len - numnondigits;
13278 assert(numdigits > 0);
13279
13280 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013281 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013282 (type == 'o' || type == 'x' || type == 'X'))) {
13283 assert(buf[sign] == '0');
13284 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13285 buf[sign+1] == 'o');
13286 numnondigits -= 2;
13287 buf += 2;
13288 len -= 2;
13289 if (sign)
13290 buf[0] = '-';
13291 assert(len == numnondigits + numdigits);
13292 assert(numdigits > 0);
13293 }
13294
13295 /* Fill with leading zeroes to meet minimum width. */
13296 if (prec > numdigits) {
13297 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13298 numnondigits + prec);
13299 char *b1;
13300 if (!r1) {
13301 Py_DECREF(result);
13302 return NULL;
13303 }
13304 b1 = PyBytes_AS_STRING(r1);
13305 for (i = 0; i < numnondigits; ++i)
13306 *b1++ = *buf++;
13307 for (i = 0; i < prec - numdigits; i++)
13308 *b1++ = '0';
13309 for (i = 0; i < numdigits; i++)
13310 *b1++ = *buf++;
13311 *b1 = '\0';
13312 Py_DECREF(result);
13313 result = r1;
13314 buf = PyBytes_AS_STRING(result);
13315 len = numnondigits + prec;
13316 }
13317
13318 /* Fix up case for hex conversions. */
13319 if (type == 'X') {
13320 /* Need to convert all lower case letters to upper case.
13321 and need to convert 0x to 0X (and -0x to -0X). */
13322 for (i = 0; i < len; i++)
13323 if (buf[i] >= 'a' && buf[i] <= 'x')
13324 buf[i] -= 'a'-'A';
13325 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013326 if (!PyUnicode_Check(result)
13327 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013328 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013329 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013330 Py_DECREF(result);
13331 result = unicode;
13332 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013333 else if (len != PyUnicode_GET_LENGTH(result)) {
13334 if (PyUnicode_Resize(&result, len) < 0)
13335 Py_CLEAR(result);
13336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013337 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013338}
13339
Victor Stinner621ef3d2012-10-02 00:33:47 +020013340/* Format an integer.
13341 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013342 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013343 * -1 and raise an exception on error */
13344static int
Victor Stinnera47082312012-10-04 02:19:54 +020013345mainformatlong(PyObject *v,
13346 struct unicode_format_arg_t *arg,
13347 PyObject **p_output,
13348 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013349{
13350 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013351 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013352
13353 if (!PyNumber_Check(v))
13354 goto wrongtype;
13355
13356 if (!PyLong_Check(v)) {
13357 iobj = PyNumber_Long(v);
13358 if (iobj == NULL) {
13359 if (PyErr_ExceptionMatches(PyExc_TypeError))
13360 goto wrongtype;
13361 return -1;
13362 }
13363 assert(PyLong_Check(iobj));
13364 }
13365 else {
13366 iobj = v;
13367 Py_INCREF(iobj);
13368 }
13369
13370 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013371 && arg->width == -1 && arg->prec == -1
13372 && !(arg->flags & (F_SIGN | F_BLANK))
13373 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013374 {
13375 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013376 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013377 int base;
13378
Victor Stinnera47082312012-10-04 02:19:54 +020013379 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013380 {
13381 default:
13382 assert(0 && "'type' not in [diuoxX]");
13383 case 'd':
13384 case 'i':
13385 case 'u':
13386 base = 10;
13387 break;
13388 case 'o':
13389 base = 8;
13390 break;
13391 case 'x':
13392 case 'X':
13393 base = 16;
13394 break;
13395 }
13396
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013397 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13398 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013399 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013400 }
13401 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013402 return 1;
13403 }
13404
Victor Stinnera47082312012-10-04 02:19:54 +020013405 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013406 Py_DECREF(iobj);
13407 if (res == NULL)
13408 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013409 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013410 return 0;
13411
13412wrongtype:
13413 PyErr_Format(PyExc_TypeError,
13414 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013415 "not %.200s",
13416 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013417 return -1;
13418}
13419
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013420static Py_UCS4
13421formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013423 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013424 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013425 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013426 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013427 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013428 goto onError;
13429 }
13430 else {
13431 /* Integer input truncated to a character */
13432 long x;
13433 x = PyLong_AsLong(v);
13434 if (x == -1 && PyErr_Occurred())
13435 goto onError;
13436
Victor Stinner8faf8212011-12-08 22:14:11 +010013437 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013438 PyErr_SetString(PyExc_OverflowError,
13439 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013440 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013441 }
13442
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013443 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013444 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013445
Benjamin Peterson29060642009-01-31 22:14:21 +000013446 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013447 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013449 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013450}
13451
Victor Stinnera47082312012-10-04 02:19:54 +020013452/* Parse options of an argument: flags, width, precision.
13453 Handle also "%(name)" syntax.
13454
13455 Return 0 if the argument has been formatted into arg->str.
13456 Return 1 if the argument has been written into ctx->writer,
13457 Raise an exception and return -1 on error. */
13458static int
13459unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13460 struct unicode_format_arg_t *arg)
13461{
13462#define FORMAT_READ(ctx) \
13463 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13464
13465 PyObject *v;
13466
Victor Stinnera47082312012-10-04 02:19:54 +020013467 if (arg->ch == '(') {
13468 /* Get argument value from a dictionary. Example: "%(name)s". */
13469 Py_ssize_t keystart;
13470 Py_ssize_t keylen;
13471 PyObject *key;
13472 int pcount = 1;
13473
13474 if (ctx->dict == NULL) {
13475 PyErr_SetString(PyExc_TypeError,
13476 "format requires a mapping");
13477 return -1;
13478 }
13479 ++ctx->fmtpos;
13480 --ctx->fmtcnt;
13481 keystart = ctx->fmtpos;
13482 /* Skip over balanced parentheses */
13483 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13484 arg->ch = FORMAT_READ(ctx);
13485 if (arg->ch == ')')
13486 --pcount;
13487 else if (arg->ch == '(')
13488 ++pcount;
13489 ctx->fmtpos++;
13490 }
13491 keylen = ctx->fmtpos - keystart - 1;
13492 if (ctx->fmtcnt < 0 || pcount > 0) {
13493 PyErr_SetString(PyExc_ValueError,
13494 "incomplete format key");
13495 return -1;
13496 }
13497 key = PyUnicode_Substring(ctx->fmtstr,
13498 keystart, keystart + keylen);
13499 if (key == NULL)
13500 return -1;
13501 if (ctx->args_owned) {
13502 Py_DECREF(ctx->args);
13503 ctx->args_owned = 0;
13504 }
13505 ctx->args = PyObject_GetItem(ctx->dict, key);
13506 Py_DECREF(key);
13507 if (ctx->args == NULL)
13508 return -1;
13509 ctx->args_owned = 1;
13510 ctx->arglen = -1;
13511 ctx->argidx = -2;
13512 }
13513
13514 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013515 while (--ctx->fmtcnt >= 0) {
13516 arg->ch = FORMAT_READ(ctx);
13517 ctx->fmtpos++;
13518 switch (arg->ch) {
13519 case '-': arg->flags |= F_LJUST; continue;
13520 case '+': arg->flags |= F_SIGN; continue;
13521 case ' ': arg->flags |= F_BLANK; continue;
13522 case '#': arg->flags |= F_ALT; continue;
13523 case '0': arg->flags |= F_ZERO; continue;
13524 }
13525 break;
13526 }
13527
13528 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013529 if (arg->ch == '*') {
13530 v = unicode_format_getnextarg(ctx);
13531 if (v == NULL)
13532 return -1;
13533 if (!PyLong_Check(v)) {
13534 PyErr_SetString(PyExc_TypeError,
13535 "* wants int");
13536 return -1;
13537 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013538 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013539 if (arg->width == -1 && PyErr_Occurred())
13540 return -1;
13541 if (arg->width < 0) {
13542 arg->flags |= F_LJUST;
13543 arg->width = -arg->width;
13544 }
13545 if (--ctx->fmtcnt >= 0) {
13546 arg->ch = FORMAT_READ(ctx);
13547 ctx->fmtpos++;
13548 }
13549 }
13550 else if (arg->ch >= '0' && arg->ch <= '9') {
13551 arg->width = arg->ch - '0';
13552 while (--ctx->fmtcnt >= 0) {
13553 arg->ch = FORMAT_READ(ctx);
13554 ctx->fmtpos++;
13555 if (arg->ch < '0' || arg->ch > '9')
13556 break;
13557 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13558 mixing signed and unsigned comparison. Since arg->ch is between
13559 '0' and '9', casting to int is safe. */
13560 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13561 PyErr_SetString(PyExc_ValueError,
13562 "width too big");
13563 return -1;
13564 }
13565 arg->width = arg->width*10 + (arg->ch - '0');
13566 }
13567 }
13568
13569 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020013570 if (arg->ch == '.') {
13571 arg->prec = 0;
13572 if (--ctx->fmtcnt >= 0) {
13573 arg->ch = FORMAT_READ(ctx);
13574 ctx->fmtpos++;
13575 }
13576 if (arg->ch == '*') {
13577 v = unicode_format_getnextarg(ctx);
13578 if (v == NULL)
13579 return -1;
13580 if (!PyLong_Check(v)) {
13581 PyErr_SetString(PyExc_TypeError,
13582 "* wants int");
13583 return -1;
13584 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013585 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013586 if (arg->prec == -1 && PyErr_Occurred())
13587 return -1;
13588 if (arg->prec < 0)
13589 arg->prec = 0;
13590 if (--ctx->fmtcnt >= 0) {
13591 arg->ch = FORMAT_READ(ctx);
13592 ctx->fmtpos++;
13593 }
13594 }
13595 else if (arg->ch >= '0' && arg->ch <= '9') {
13596 arg->prec = arg->ch - '0';
13597 while (--ctx->fmtcnt >= 0) {
13598 arg->ch = FORMAT_READ(ctx);
13599 ctx->fmtpos++;
13600 if (arg->ch < '0' || arg->ch > '9')
13601 break;
13602 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13603 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013604 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013605 return -1;
13606 }
13607 arg->prec = arg->prec*10 + (arg->ch - '0');
13608 }
13609 }
13610 }
13611
13612 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13613 if (ctx->fmtcnt >= 0) {
13614 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13615 if (--ctx->fmtcnt >= 0) {
13616 arg->ch = FORMAT_READ(ctx);
13617 ctx->fmtpos++;
13618 }
13619 }
13620 }
13621 if (ctx->fmtcnt < 0) {
13622 PyErr_SetString(PyExc_ValueError,
13623 "incomplete format");
13624 return -1;
13625 }
13626 return 0;
13627
13628#undef FORMAT_READ
13629}
13630
13631/* Format one argument. Supported conversion specifiers:
13632
13633 - "s", "r", "a": any type
13634 - "i", "d", "u", "o", "x", "X": int
13635 - "e", "E", "f", "F", "g", "G": float
13636 - "c": int or str (1 character)
13637
Victor Stinner8dbd4212012-12-04 09:30:24 +010013638 When possible, the output is written directly into the Unicode writer
13639 (ctx->writer). A string is created when padding is required.
13640
Victor Stinnera47082312012-10-04 02:19:54 +020013641 Return 0 if the argument has been formatted into *p_str,
13642 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010013643 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020013644static int
13645unicode_format_arg_format(struct unicode_formatter_t *ctx,
13646 struct unicode_format_arg_t *arg,
13647 PyObject **p_str)
13648{
13649 PyObject *v;
13650 _PyUnicodeWriter *writer = &ctx->writer;
13651
13652 if (ctx->fmtcnt == 0)
13653 ctx->writer.overallocate = 0;
13654
13655 if (arg->ch == '%') {
13656 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13657 return -1;
13658 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13659 writer->pos += 1;
13660 return 1;
13661 }
13662
13663 v = unicode_format_getnextarg(ctx);
13664 if (v == NULL)
13665 return -1;
13666
Victor Stinnera47082312012-10-04 02:19:54 +020013667
13668 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020013669 case 's':
13670 case 'r':
13671 case 'a':
13672 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13673 /* Fast path */
13674 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13675 return -1;
13676 return 1;
13677 }
13678
13679 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13680 *p_str = v;
13681 Py_INCREF(*p_str);
13682 }
13683 else {
13684 if (arg->ch == 's')
13685 *p_str = PyObject_Str(v);
13686 else if (arg->ch == 'r')
13687 *p_str = PyObject_Repr(v);
13688 else
13689 *p_str = PyObject_ASCII(v);
13690 }
13691 break;
13692
13693 case 'i':
13694 case 'd':
13695 case 'u':
13696 case 'o':
13697 case 'x':
13698 case 'X':
13699 {
13700 int ret = mainformatlong(v, arg, p_str, writer);
13701 if (ret != 0)
13702 return ret;
13703 arg->sign = 1;
13704 break;
13705 }
13706
13707 case 'e':
13708 case 'E':
13709 case 'f':
13710 case 'F':
13711 case 'g':
13712 case 'G':
13713 if (arg->width == -1 && arg->prec == -1
13714 && !(arg->flags & (F_SIGN | F_BLANK)))
13715 {
13716 /* Fast path */
13717 if (formatfloat(v, arg, NULL, writer) == -1)
13718 return -1;
13719 return 1;
13720 }
13721
13722 arg->sign = 1;
13723 if (formatfloat(v, arg, p_str, NULL) == -1)
13724 return -1;
13725 break;
13726
13727 case 'c':
13728 {
13729 Py_UCS4 ch = formatchar(v);
13730 if (ch == (Py_UCS4) -1)
13731 return -1;
13732 if (arg->width == -1 && arg->prec == -1) {
13733 /* Fast path */
13734 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13735 return -1;
13736 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13737 writer->pos += 1;
13738 return 1;
13739 }
13740 *p_str = PyUnicode_FromOrdinal(ch);
13741 break;
13742 }
13743
13744 default:
13745 PyErr_Format(PyExc_ValueError,
13746 "unsupported format character '%c' (0x%x) "
13747 "at index %zd",
13748 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13749 (int)arg->ch,
13750 ctx->fmtpos - 1);
13751 return -1;
13752 }
13753 if (*p_str == NULL)
13754 return -1;
13755 assert (PyUnicode_Check(*p_str));
13756 return 0;
13757}
13758
/* Helper of unicode_format_arg(): write the already converted string `str`
   into ctx->writer, honouring the width, precision, sign and flag settings
   stored in `arg`.

   ctx: formatter state; only ctx->writer is used here.
   arg: parsed format specifier.  arg->width and arg->sign are modified
        while the output is laid out.
   str: the text to emit.

   Return 0 on success, -1 if readying `str` or growing the writer fails. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar, bufmaxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding only applies to signed (numeric) conversions */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, truncation or sign handling is needed,
           so the string can be copied as is */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is taken from the string and emitted separately;
               skip it in the source and do not count it in len */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character has to be written at all */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer: compute the maximum character that will be
       written (fill character included, when left padding is needed)
       and the number of characters to reserve */
    bufmaxchar = 127;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
        }
        else {
            if (arg->width > len)
                bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
        }
    }
    maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
    bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
    buflen = arg->width;
    /* One extra character for the sign when the digits fill the width */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1)
        return -1;

    /* Write the sign if needed.  With a non-space fill the sign goes
       before the padding; with space fill it is written later, after
       the padding.  Either way it consumes one column of the width. */
    if (arg->sign) {
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from
           the digits, whether they are written here or after the padding */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right if needed.  Right padding always uses spaces, whatever
       the fill character is. */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
13910
13911/* Helper of PyUnicode_Format(): format one arg.
13912 Return 0 on success, raise an exception and return -1 on error. */
13913static int
13914unicode_format_arg(struct unicode_formatter_t *ctx)
13915{
13916 struct unicode_format_arg_t arg;
13917 PyObject *str;
13918 int ret;
13919
Victor Stinner8dbd4212012-12-04 09:30:24 +010013920 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
13921 arg.flags = 0;
13922 arg.width = -1;
13923 arg.prec = -1;
13924 arg.sign = 0;
13925 str = NULL;
13926
Victor Stinnera47082312012-10-04 02:19:54 +020013927 ret = unicode_format_arg_parse(ctx, &arg);
13928 if (ret == -1)
13929 return -1;
13930
13931 ret = unicode_format_arg_format(ctx, &arg, &str);
13932 if (ret == -1)
13933 return -1;
13934
13935 if (ret != 1) {
13936 ret = unicode_format_arg_output(ctx, &arg, str);
13937 Py_DECREF(str);
13938 if (ret == -1)
13939 return -1;
13940 }
13941
13942 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
13943 PyErr_SetString(PyExc_TypeError,
13944 "not all arguments converted during string formatting");
13945 return -1;
13946 }
13947 return 0;
13948}
13949
Alexander Belopolsky40018472011-02-26 01:02:56 +000013950PyObject *
13951PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013952{
Victor Stinnera47082312012-10-04 02:19:54 +020013953 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000013954
Guido van Rossumd57fd912000-03-10 22:53:23 +000013955 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013956 PyErr_BadInternalCall();
13957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013958 }
Victor Stinnera47082312012-10-04 02:19:54 +020013959
13960 ctx.fmtstr = PyUnicode_FromObject(format);
13961 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013962 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020013963 if (PyUnicode_READY(ctx.fmtstr) == -1) {
13964 Py_DECREF(ctx.fmtstr);
13965 return NULL;
13966 }
13967 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
13968 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
13969 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
13970 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013971
Victor Stinnera47082312012-10-04 02:19:54 +020013972 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013973
Guido van Rossumd57fd912000-03-10 22:53:23 +000013974 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020013975 ctx.arglen = PyTuple_Size(args);
13976 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013977 }
13978 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013979 ctx.arglen = -1;
13980 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013981 }
Victor Stinnera47082312012-10-04 02:19:54 +020013982 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013983 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020013984 ctx.dict = args;
13985 else
13986 ctx.dict = NULL;
13987 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013988
Victor Stinnera47082312012-10-04 02:19:54 +020013989 while (--ctx.fmtcnt >= 0) {
13990 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13991 Py_ssize_t nonfmtpos, sublen;
13992 Py_UCS4 maxchar;
13993
13994 nonfmtpos = ctx.fmtpos++;
13995 while (ctx.fmtcnt >= 0 &&
13996 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13997 ctx.fmtpos++;
13998 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013999 }
Victor Stinnera47082312012-10-04 02:19:54 +020014000 if (ctx.fmtcnt < 0) {
14001 ctx.fmtpos--;
14002 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014003 }
Victor Stinnera47082312012-10-04 02:19:54 +020014004 sublen = ctx.fmtpos - nonfmtpos;
14005 maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr,
Victor Stinneree4544c2012-05-09 22:24:08 +020014006 nonfmtpos, nonfmtpos + sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014007 if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014008 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020014009
Victor Stinnera47082312012-10-04 02:19:54 +020014010 _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos,
14011 ctx.fmtstr, nonfmtpos, sublen);
14012 ctx.writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014013 }
14014 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014015 ctx.fmtpos++;
14016 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014017 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014018 }
14019 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014020
Victor Stinnera47082312012-10-04 02:19:54 +020014021 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014022 PyErr_SetString(PyExc_TypeError,
14023 "not all arguments converted during string formatting");
14024 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014025 }
14026
Victor Stinnera47082312012-10-04 02:19:54 +020014027 if (ctx.args_owned) {
14028 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014029 }
Victor Stinnera47082312012-10-04 02:19:54 +020014030 Py_DECREF(ctx.fmtstr);
14031 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014032
Benjamin Peterson29060642009-01-31 22:14:21 +000014033 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014034 Py_DECREF(ctx.fmtstr);
14035 _PyUnicodeWriter_Dealloc(&ctx.writer);
14036 if (ctx.args_owned) {
14037 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014038 }
14039 return NULL;
14040}
14041
Jeremy Hylton938ace62002-07-17 16:30:39 +000014042static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014043unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14044
Tim Peters6d6c1a32001-08-02 04:15:00 +000014045static PyObject *
14046unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14047{
Benjamin Peterson29060642009-01-31 22:14:21 +000014048 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014049 static char *kwlist[] = {"object", "encoding", "errors", 0};
14050 char *encoding = NULL;
14051 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014052
Benjamin Peterson14339b62009-01-31 16:36:08 +000014053 if (type != &PyUnicode_Type)
14054 return unicode_subtype_new(type, args, kwds);
14055 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014056 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014057 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014058 if (x == NULL) {
14059 Py_INCREF(unicode_empty);
14060 return unicode_empty;
14061 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014062 if (encoding == NULL && errors == NULL)
14063 return PyObject_Str(x);
14064 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014065 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014066}
14067
Guido van Rossume023fe02001-08-30 03:12:59 +000014068static PyObject *
14069unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14070{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014071 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014072 Py_ssize_t length, char_size;
14073 int share_wstr, share_utf8;
14074 unsigned int kind;
14075 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014076
Benjamin Peterson14339b62009-01-31 16:36:08 +000014077 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014078
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014079 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014080 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014081 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014082 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014083 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014084 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014085 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014086 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014087
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014088 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014089 if (self == NULL) {
14090 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014091 return NULL;
14092 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014093 kind = PyUnicode_KIND(unicode);
14094 length = PyUnicode_GET_LENGTH(unicode);
14095
14096 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014097#ifdef Py_DEBUG
14098 _PyUnicode_HASH(self) = -1;
14099#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014100 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014101#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014102 _PyUnicode_STATE(self).interned = 0;
14103 _PyUnicode_STATE(self).kind = kind;
14104 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014105 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014106 _PyUnicode_STATE(self).ready = 1;
14107 _PyUnicode_WSTR(self) = NULL;
14108 _PyUnicode_UTF8_LENGTH(self) = 0;
14109 _PyUnicode_UTF8(self) = NULL;
14110 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014111 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014112
14113 share_utf8 = 0;
14114 share_wstr = 0;
14115 if (kind == PyUnicode_1BYTE_KIND) {
14116 char_size = 1;
14117 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14118 share_utf8 = 1;
14119 }
14120 else if (kind == PyUnicode_2BYTE_KIND) {
14121 char_size = 2;
14122 if (sizeof(wchar_t) == 2)
14123 share_wstr = 1;
14124 }
14125 else {
14126 assert(kind == PyUnicode_4BYTE_KIND);
14127 char_size = 4;
14128 if (sizeof(wchar_t) == 4)
14129 share_wstr = 1;
14130 }
14131
14132 /* Ensure we won't overflow the length. */
14133 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14134 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014135 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014136 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014137 data = PyObject_MALLOC((length + 1) * char_size);
14138 if (data == NULL) {
14139 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014140 goto onError;
14141 }
14142
Victor Stinnerc3c74152011-10-02 20:39:55 +020014143 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014144 if (share_utf8) {
14145 _PyUnicode_UTF8_LENGTH(self) = length;
14146 _PyUnicode_UTF8(self) = data;
14147 }
14148 if (share_wstr) {
14149 _PyUnicode_WSTR_LENGTH(self) = length;
14150 _PyUnicode_WSTR(self) = (wchar_t *)data;
14151 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014152
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014153 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014154 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014155 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014156#ifdef Py_DEBUG
14157 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14158#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014159 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014160 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014161
14162onError:
14163 Py_DECREF(unicode);
14164 Py_DECREF(self);
14165 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014166}
14167
/* Docstring of the str type; installed as the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014179
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014180static PyObject *unicode_iter(PyObject *seq);
14181
/* Type object of str.  Slots set to 0 are left unset in this initializer. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    /* Subclassable (BASETYPE) */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
14225
14226/* Initialize the Unicode implementation */
14227
Victor Stinner3a50e702011-10-18 21:21:00 +020014228int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014229{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014230 int i;
14231
Thomas Wouters477c8d52006-05-27 19:21:47 +000014232 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014233 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014234 0x000A, /* LINE FEED */
14235 0x000D, /* CARRIAGE RETURN */
14236 0x001C, /* FILE SEPARATOR */
14237 0x001D, /* GROUP SEPARATOR */
14238 0x001E, /* RECORD SEPARATOR */
14239 0x0085, /* NEXT LINE */
14240 0x2028, /* LINE SEPARATOR */
14241 0x2029, /* PARAGRAPH SEPARATOR */
14242 };
14243
Fred Drakee4315f52000-05-09 19:53:39 +000014244 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014245 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014246 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014247 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014248 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014249
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014250 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014251 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014252 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014253 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014254
14255 /* initialize the linebreak bloom filter */
14256 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014257 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014258 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014259
14260 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014261
Benjamin Petersonc4311282012-10-30 23:21:10 -040014262 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14263 Py_FatalError("Can't initialize field name iterator type");
14264
14265 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14266 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014267
Victor Stinner3a50e702011-10-18 21:21:00 +020014268#ifdef HAVE_MBCS
14269 winver.dwOSVersionInfoSize = sizeof(winver);
14270 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14271 PyErr_SetFromWindowsErr(0);
14272 return -1;
14273 }
14274#endif
14275 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014276}
14277
14278/* Finalize the Unicode implementation */
14279
/* Release cached free objects.  Nothing is released here: the function
   always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14285
Guido van Rossumd57fd912000-03-10 22:53:23 +000014286void
Thomas Wouters78890102000-07-22 19:25:51 +000014287_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014288{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014289 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014290
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014291 Py_XDECREF(unicode_empty);
14292 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014293
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014294 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014295 if (unicode_latin1[i]) {
14296 Py_DECREF(unicode_latin1[i]);
14297 unicode_latin1[i] = NULL;
14298 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014299 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014300 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014301 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014302}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014303
Walter Dörwald16807132007-05-25 13:52:07 +000014304void
14305PyUnicode_InternInPlace(PyObject **p)
14306{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014307 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014308 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014309#ifdef Py_DEBUG
14310 assert(s != NULL);
14311 assert(_PyUnicode_CHECK(s));
14312#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014313 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014314 return;
14315#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014316 /* If it's a subclass, we don't really know what putting
14317 it in the interned dict might do. */
14318 if (!PyUnicode_CheckExact(s))
14319 return;
14320 if (PyUnicode_CHECK_INTERNED(s))
14321 return;
14322 if (interned == NULL) {
14323 interned = PyDict_New();
14324 if (interned == NULL) {
14325 PyErr_Clear(); /* Don't leave an exception */
14326 return;
14327 }
14328 }
14329 /* It might be that the GetItem call fails even
14330 though the key is present in the dictionary,
14331 namely when this happens during a stack overflow. */
14332 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014333 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014334 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014335
Benjamin Peterson29060642009-01-31 22:14:21 +000014336 if (t) {
14337 Py_INCREF(t);
14338 Py_DECREF(*p);
14339 *p = t;
14340 return;
14341 }
Walter Dörwald16807132007-05-25 13:52:07 +000014342
Benjamin Peterson14339b62009-01-31 16:36:08 +000014343 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014344 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014345 PyErr_Clear();
14346 PyThreadState_GET()->recursion_critical = 0;
14347 return;
14348 }
14349 PyThreadState_GET()->recursion_critical = 0;
14350 /* The two references in interned are not counted by refcnt.
14351 The deallocator will take care of this */
14352 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014353 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014354}
14355
14356void
14357PyUnicode_InternImmortal(PyObject **p)
14358{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014359 PyUnicode_InternInPlace(p);
14360 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014361 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014362 Py_INCREF(*p);
14363 }
Walter Dörwald16807132007-05-25 13:52:07 +000014364}
14365
14366PyObject *
14367PyUnicode_InternFromString(const char *cp)
14368{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014369 PyObject *s = PyUnicode_FromString(cp);
14370 if (s == NULL)
14371 return NULL;
14372 PyUnicode_InternInPlace(&s);
14373 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014374}
14375
Alexander Belopolsky40018472011-02-26 01:02:56 +000014376void
14377_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014378{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014379 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014380 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014381 Py_ssize_t i, n;
14382 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014383
Benjamin Peterson14339b62009-01-31 16:36:08 +000014384 if (interned == NULL || !PyDict_Check(interned))
14385 return;
14386 keys = PyDict_Keys(interned);
14387 if (keys == NULL || !PyList_Check(keys)) {
14388 PyErr_Clear();
14389 return;
14390 }
Walter Dörwald16807132007-05-25 13:52:07 +000014391
Benjamin Peterson14339b62009-01-31 16:36:08 +000014392 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14393 detector, interned unicode strings are not forcibly deallocated;
14394 rather, we give them their stolen references back, and then clear
14395 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014396
Benjamin Peterson14339b62009-01-31 16:36:08 +000014397 n = PyList_GET_SIZE(keys);
14398 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014399 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014400 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014401 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014402 if (PyUnicode_READY(s) == -1) {
14403 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014404 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014405 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014406 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014407 case SSTATE_NOT_INTERNED:
14408 /* XXX Shouldn't happen */
14409 break;
14410 case SSTATE_INTERNED_IMMORTAL:
14411 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014412 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014413 break;
14414 case SSTATE_INTERNED_MORTAL:
14415 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014416 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014417 break;
14418 default:
14419 Py_FatalError("Inconsistent interned string state.");
14420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014421 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014422 }
14423 fprintf(stderr, "total size of all interned strings: "
14424 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14425 "mortal/immortal\n", mortal_size, immortal_size);
14426 Py_DECREF(keys);
14427 PyDict_Clear(interned);
14428 Py_DECREF(interned);
14429 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014430}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014431
14432
14433/********************* Unicode Iterator **************************/
14434
/* Iterator over the characters of a string. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14440
14441static void
14442unicodeiter_dealloc(unicodeiterobject *it)
14443{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014444 _PyObject_GC_UNTRACK(it);
14445 Py_XDECREF(it->it_seq);
14446 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014447}
14448
14449static int
14450unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14451{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014452 Py_VISIT(it->it_seq);
14453 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014454}
14455
14456static PyObject *
14457unicodeiter_next(unicodeiterobject *it)
14458{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014459 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014460
Benjamin Peterson14339b62009-01-31 16:36:08 +000014461 assert(it != NULL);
14462 seq = it->it_seq;
14463 if (seq == NULL)
14464 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014465 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014467 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14468 int kind = PyUnicode_KIND(seq);
14469 void *data = PyUnicode_DATA(seq);
14470 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14471 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014472 if (item != NULL)
14473 ++it->it_index;
14474 return item;
14475 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014476
Benjamin Peterson14339b62009-01-31 16:36:08 +000014477 Py_DECREF(seq);
14478 it->it_seq = NULL;
14479 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014480}
14481
14482static PyObject *
14483unicodeiter_len(unicodeiterobject *it)
14484{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014485 Py_ssize_t len = 0;
14486 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014487 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014488 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014489}
14490
14491PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14492
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014493static PyObject *
14494unicodeiter_reduce(unicodeiterobject *it)
14495{
14496 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014497 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014498 it->it_seq, it->it_index);
14499 } else {
14500 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14501 if (u == NULL)
14502 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014503 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014504 }
14505}
14506
14507PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14508
14509static PyObject *
14510unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14511{
14512 Py_ssize_t index = PyLong_AsSsize_t(state);
14513 if (index == -1 && PyErr_Occurred())
14514 return NULL;
14515 if (index < 0)
14516 index = 0;
14517 it->it_index = index;
14518 Py_RETURN_NONE;
14519}
14520
14521PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14522
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014523static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014524 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014525 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014526 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14527 reduce_doc},
14528 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14529 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014530 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014531};
14532
14533PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014534 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14535 "str_iterator", /* tp_name */
14536 sizeof(unicodeiterobject), /* tp_basicsize */
14537 0, /* tp_itemsize */
14538 /* methods */
14539 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14540 0, /* tp_print */
14541 0, /* tp_getattr */
14542 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014543 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014544 0, /* tp_repr */
14545 0, /* tp_as_number */
14546 0, /* tp_as_sequence */
14547 0, /* tp_as_mapping */
14548 0, /* tp_hash */
14549 0, /* tp_call */
14550 0, /* tp_str */
14551 PyObject_GenericGetAttr, /* tp_getattro */
14552 0, /* tp_setattro */
14553 0, /* tp_as_buffer */
14554 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14555 0, /* tp_doc */
14556 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14557 0, /* tp_clear */
14558 0, /* tp_richcompare */
14559 0, /* tp_weaklistoffset */
14560 PyObject_SelfIter, /* tp_iter */
14561 (iternextfunc)unicodeiter_next, /* tp_iternext */
14562 unicodeiter_methods, /* tp_methods */
14563 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014564};
14565
14566static PyObject *
14567unicode_iter(PyObject *seq)
14568{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014569 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014570
Benjamin Peterson14339b62009-01-31 16:36:08 +000014571 if (!PyUnicode_Check(seq)) {
14572 PyErr_BadInternalCall();
14573 return NULL;
14574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014575 if (PyUnicode_READY(seq) == -1)
14576 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014577 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14578 if (it == NULL)
14579 return NULL;
14580 it->it_index = 0;
14581 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014582 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014583 _PyObject_GC_TRACK(it);
14584 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014585}
14586
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014587
14588size_t
14589Py_UNICODE_strlen(const Py_UNICODE *u)
14590{
14591 int res = 0;
14592 while(*u++)
14593 res++;
14594 return res;
14595}
14596
14597Py_UNICODE*
14598Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14599{
14600 Py_UNICODE *u = s1;
14601 while ((*u++ = *s2++));
14602 return s1;
14603}
14604
14605Py_UNICODE*
14606Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14607{
14608 Py_UNICODE *u = s1;
14609 while ((*u++ = *s2++))
14610 if (n-- == 0)
14611 break;
14612 return s1;
14613}
14614
14615Py_UNICODE*
14616Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14617{
14618 Py_UNICODE *u1 = s1;
14619 u1 += Py_UNICODE_strlen(u1);
14620 Py_UNICODE_strcpy(u1, s2);
14621 return s1;
14622}
14623
14624int
14625Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14626{
14627 while (*s1 && *s2 && *s1 == *s2)
14628 s1++, s2++;
14629 if (*s1 && *s2)
14630 return (*s1 < *s2) ? -1 : +1;
14631 if (*s1)
14632 return 1;
14633 if (*s2)
14634 return -1;
14635 return 0;
14636}
14637
14638int
14639Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14640{
14641 register Py_UNICODE u1, u2;
14642 for (; n != 0; n--) {
14643 u1 = *s1;
14644 u2 = *s2;
14645 if (u1 != u2)
14646 return (u1 < u2) ? -1 : +1;
14647 if (u1 == '\0')
14648 return 0;
14649 s1++;
14650 s2++;
14651 }
14652 return 0;
14653}
14654
14655Py_UNICODE*
14656Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14657{
14658 const Py_UNICODE *p;
14659 for (p = s; *p; p++)
14660 if (*p == c)
14661 return (Py_UNICODE*)p;
14662 return NULL;
14663}
14664
14665Py_UNICODE*
14666Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14667{
14668 const Py_UNICODE *p;
14669 p = s + Py_UNICODE_strlen(s);
14670 while (p != s) {
14671 p--;
14672 if (*p == c)
14673 return (Py_UNICODE*)p;
14674 }
14675 return NULL;
14676}
Victor Stinner331ea922010-08-10 16:37:20 +000014677
Victor Stinner71133ff2010-09-01 23:43:53 +000014678Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014679PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014680{
Victor Stinner577db2c2011-10-11 22:12:48 +020014681 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014682 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014684 if (!PyUnicode_Check(unicode)) {
14685 PyErr_BadArgument();
14686 return NULL;
14687 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014688 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014689 if (u == NULL)
14690 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014691 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014692 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014693 PyErr_NoMemory();
14694 return NULL;
14695 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014696 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014697 size *= sizeof(Py_UNICODE);
14698 copy = PyMem_Malloc(size);
14699 if (copy == NULL) {
14700 PyErr_NoMemory();
14701 return NULL;
14702 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014703 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014704 return copy;
14705}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014706
Georg Brandl66c221e2010-10-14 07:04:07 +000014707/* A _string module, to export formatter_parser and formatter_field_name_split
14708 to the string.Formatter class implemented in Python. */
14709
14710static PyMethodDef _string_methods[] = {
14711 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14712 METH_O, PyDoc_STR("split the argument as a field name")},
14713 {"formatter_parser", (PyCFunction) formatter_parser,
14714 METH_O, PyDoc_STR("parse the argument as a format string")},
14715 {NULL, NULL}
14716};
14717
14718static struct PyModuleDef _string_module = {
14719 PyModuleDef_HEAD_INIT,
14720 "_string",
14721 PyDoc_STR("string helper module"),
14722 0,
14723 _string_methods,
14724 NULL,
14725 NULL,
14726 NULL,
14727 NULL
14728};
14729
14730PyMODINIT_FUNC
14731PyInit__string(void)
14732{
14733 return PyModule_Create(&_string_module);
14734}
14735
14736
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014737#ifdef __cplusplus
14738}
14739#endif